add log print module

f1426f03 · liuruilong · 0c9279af · ffb6447c · f1426f03 · f1426f03
82 changed files
--- a/.clang-format
+++ b/.clang-format
+---
+Language:        Cpp
+BasedOnStyle:  LLVM
+Standard:  Cpp11 
+IndentWidth: 4
+NamespaceIndentation: All
+...
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,6 +6,7 @@ repos:
        files: (src).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$
    -   id: remove-tabs
        files: (src).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$
+
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
    hooks:
@@ -18,11 +19,21 @@ repos:
        files: (src).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$
    -   id: trailing-whitespace
        files: (src).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$
+
 -   repo: local
    hooks:
    -   id: clang-format-with-version-check
        name: clang-format
        description: Format files with ClangFormat.
-        entry: bash .clang_format.hook -i
+        entry: bash ./tools/pre-commit.hooks/.clang_format.hook -i
        language: system
        files: (src).*\.(c|cc|cxx|cpp|h|hpp|hxx)$
+
+#-   repo: local
+#    hooks:
+#    -   id: copyright_checker
+#        name: copyright_checker
+#        entry: python ./tools/pre-commit.hooks/.copyright.hook
+#        language: system
+#        files: (src).*\.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
+#        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,7 +46,8 @@ target_link_libraries(paddle-mobile-static protobuf-lite openblas)
 add_dependencies(paddle-mobile openblas_proj)

 # gen test
-ADD_EXECUTABLE(paddle-mobile-test test/main.cpp test/test_helper.h)
+ADD_EXECUTABLE(paddle-mobile-test test/main.cpp test/test_helper.h
+        test/elementwise_add_op_test.h test/test_include.h)
 target_link_libraries(paddle-mobile-test paddle-mobile)

 # gen test log

--- a/README.md
+++ b/README.md
-# Paddle-Mobile
+# Paddle-Mobile 
+
+![License MIT](https://img.shields.io/github/license/mashape/apistatus.svg)  [![Build Status](https://travis-ci.org/PaddlePaddle/paddle-mobile.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/paddle-mobile)
+
+

 This project is used to develop the next-generation deep learning framework for mobile devices.


--- a/src/common/type_define.h
+++ b/src/common/type_define.h
@@ -23,30 +23,31 @@ SOFTWARE.

 namespace paddle_mobile {

-namespace framework {
-template <typename Dtype> class OperatorBase;
-class OpDesc;
-class BlockDesc;
-class InferShapeContext;
-}
-
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-
-template <typename Dtype>
-using OpCreator = std::function<framework::OperatorBase<Dtype> *(
-    const std::string & /*type*/, const VariableNameMap & /*inputs*/,
-    const VariableNameMap & /*outputs*/,
-    const framework::AttributeMap & /*attrs*/)>;
-
-using GradOpMakerFN =
-    std::function<std::vector<std::unique_ptr<framework::OpDesc>>(
-        const framework::OpDesc &,
-        const std::unordered_set<std::string> & /*no_grad_set*/,
-        std::unordered_map<std::string, std::string> * /*grad_to_var*/,
-        const std::vector<framework::BlockDesc *> &grad_block)>;
-
-using InferVarTypeFN = std::function<void(const framework::OpDesc & /*op_desc*/,
-                                          framework::BlockDesc * /*block*/)>;
-
-using InferShapeFN = std::function<void(framework::InferShapeContext *)>;
+    namespace framework {
+        template <typename Dtype> class OperatorBase;
+        class OpDesc;
+        class BlockDesc;
+        class InferShapeContext;
+    }
+
+    using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+
+    template <typename Dtype>
+    using OpCreator = std::function<framework::OperatorBase<Dtype> *(
+        const std::string & /*type*/, const VariableNameMap & /*inputs*/,
+        const VariableNameMap & /*outputs*/,
+        const framework::AttributeMap & /*attrs*/)>;
+
+    using GradOpMakerFN =
+        std::function<std::vector<std::unique_ptr<framework::OpDesc>>(
+            const framework::OpDesc &,
+            const std::unordered_set<std::string> & /*no_grad_set*/,
+            std::unordered_map<std::string, std::string> * /*grad_to_var*/,
+            const std::vector<framework::BlockDesc *> &grad_block)>;
+
+    using InferVarTypeFN =
+        std::function<void(const framework::OpDesc & /*op_desc*/,
+                           framework::BlockDesc * /*block*/)>;
+
+    using InferShapeFN = std::function<void(framework::InferShapeContext *)>;
 };
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -19,45 +19,45 @@ SOFTWARE.
 #pragma once;

 namespace paddle_mobile {
-enum class Precision : int { FP32 = 0 };
+    enum class Precision : int { FP32 = 0 };

-//! device type
-enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
+    //! device type
+    enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };

-template <DeviceTypeEnum T> struct DeviceType {};
+    template <DeviceTypeEnum T> struct DeviceType {};

-typedef DeviceType<kCPU> CPU;
-typedef DeviceType<kFPGA> FPGA;
-typedef DeviceType<kGPU_MALI> GPU_MALI;
+    typedef DeviceType<kCPU> CPU;
+    typedef DeviceType<kFPGA> FPGA;
+    typedef DeviceType<kGPU_MALI> GPU_MALI;

-//! data type
-enum DataType {
-  PM_INVALID = -1,
-  PM_HALF = 0,
-  PM_FLOAT = 1,
-  PM_DOUBLE = 2,
-  PM_INT8 = 3,
-  PM_INT16 = 4,
-  PM_INT32 = 5,
-  PM_INT64 = 6,
-  PM_UINT8 = 7,
-  PM_UINT16 = 8,
-  PM_UINT32 = 9,
-  PM_STRING = 10,
-  PM_BOOL = 11,
-  PM_SHAPE = 12,
-  PM_TENSOR = 13
-};
-//!
-enum PMStatus {
-  PMSuccess = 0xFF,        /*!< No errors */
-  PMNotInitialized = 0x01, /*!< Data not initialized. */
-  PMInvalidValue = 0x02,   /*!< Incorrect variable value. */
-  PMMemAllocFailed = 0x03, /*!< Memory allocation error. */
-  PMUnKownError = 0x04,    /*!< Unknown error. */
-  PMOutOfAuthority = 0x05, /*!< Try to modified data not your own*/
-  PMOutOfMem = 0x06,       /*!< OOM error*/
-  PMUnImplError = 0x07,    /*!< Unimplement error. */
-  PMWrongDevice = 0x08     /*!< un-correct device. */
-};
+    //! data type
+    enum DataType {
+        PM_INVALID = -1,
+        PM_HALF = 0,
+        PM_FLOAT = 1,
+        PM_DOUBLE = 2,
+        PM_INT8 = 3,
+        PM_INT16 = 4,
+        PM_INT32 = 5,
+        PM_INT64 = 6,
+        PM_UINT8 = 7,
+        PM_UINT16 = 8,
+        PM_UINT32 = 9,
+        PM_STRING = 10,
+        PM_BOOL = 11,
+        PM_SHAPE = 12,
+        PM_TENSOR = 13
+    };
+    //!
+    enum PMStatus {
+        PMSuccess = 0xFF,        /*!< No errors */
+        PMNotInitialized = 0x01, /*!< Data not initialized. */
+        PMInvalidValue = 0x02,   /*!< Incorrect variable value. */
+        PMMemAllocFailed = 0x03, /*!< Memory allocation error. */
+        PMUnKownError = 0x04,    /*!< Unknown error. */
+        PMOutOfAuthority = 0x05, /*!< Try to modified data not your own*/
+        PMOutOfMem = 0x06,       /*!< OOM error*/
+        PMUnImplError = 0x07,    /*!< Unimplement error. */
+        PMWrongDevice = 0x08     /*!< un-correct device. */
+    };
 }
--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -21,79 +21,79 @@ SOFTWARE.
 #pragma once

 namespace paddle_mobile {
-template <int ID, typename Type> struct IDToType { typedef Type type_t; };
+    template <int ID, typename Type> struct IDToType { typedef Type type_t; };

-template <typename F, typename... Ts> struct VariantHelper {
-  static const size_t size = sizeof(F) > VariantHelper<Ts...>::size
-                                 ? sizeof(F)
-                                 : VariantHelper<Ts...>::size;
+    template <typename F, typename... Ts> struct VariantHelper {
+        static const size_t size = sizeof(F) > VariantHelper<Ts...>::size
+                                       ? sizeof(F)
+                                       : VariantHelper<Ts...>::size;

-  inline static void Destroy(size_t id, void *data) {
-    if (id == typeid(F).hash_code()) {
-      reinterpret_cast<F *>(data)->~F();
-    } else {
-      VariantHelper<Ts...>::Destroy(id, data);
-    }
-  }
-};
+        inline static void Destroy(size_t id, void *data) {
+            if (id == typeid(F).hash_code()) {
+                reinterpret_cast<F *>(data)->~F();
+            } else {
+                VariantHelper<Ts...>::Destroy(id, data);
+            }
+        }
+    };

-template <typename F> struct VariantHelper<F> {
-  static const size_t size = sizeof(F);
-  inline static void Destroy(size_t id, void *data) {
-    if (id == typeid(F).hash_code()) {
-      //              reinterpret_cast<F*>(data)->~F();
-    } else {
-      //              std::cout << "未匹配到 " << std::endl;
-    }
-  }
-};
+    template <typename F> struct VariantHelper<F> {
+        static const size_t size = sizeof(F);
+        inline static void Destroy(size_t id, void *data) {
+            if (id == typeid(F).hash_code()) {
+                //              reinterpret_cast<F*>(data)->~F();
+            } else {
+                //              std::cout << "未匹配到 " << std::endl;
+            }
+        }
+    };

-template <size_t size> class RawData {
-public:
-  char data[size];
-  RawData() {}
-  RawData(const RawData &raw_data) { strcpy(data, raw_data.data); }
-  //      void operator=(const RawData &raw_data){
-  //        strcpy(data, raw_data.data);
-  //      }
-};
+    template <size_t size> class RawData {
+      public:
+        char data[size];
+        RawData() {}
+        RawData(const RawData &raw_data) { strcpy(data, raw_data.data); }
+        //      void operator=(const RawData &raw_data){
+        //        strcpy(data, raw_data.data);
+        //      }
+    };

-template <typename... Ts> struct Variant {
-  Variant(const Variant &variant) {
-    //        std::cout << " 赋值构造函数 " << std::endl;
-    type_id = variant.type_id;
-    data = variant.data;
-  }
+    template <typename... Ts> struct Variant {
+        Variant(const Variant &variant) {
+            //        std::cout << " 赋值构造函数 " << std::endl;
+            type_id = variant.type_id;
+            data = variant.data;
+        }

-  Variant() : type_id(invalid_type()) {}
-  ~Variant() {
-    //        helper::Destroy(type_id, &data);
-  }
+        Variant() : type_id(invalid_type()) {}
+        ~Variant() {
+            //        helper::Destroy(type_id, &data);
+        }

-  template <typename T, typename... Args> void Set(Args &&... args) {
-    helper::Destroy(type_id, &data);
-    new (&data) T(std::forward<Args>(args)...);
-    type_id = typeid(T).hash_code();
-  }
+        template <typename T, typename... Args> void Set(Args &&... args) {
+            helper::Destroy(type_id, &data);
+            new (&data) T(std::forward<Args>(args)...);
+            type_id = typeid(T).hash_code();
+        }

-  template <typename T> T &Get() const {
-    if (type_id == typeid(T).hash_code()) {
-      return *const_cast<T *>(reinterpret_cast<const T *>(&data));
-    } else {
-      //      std::cout << " bad cast in variant " << std::endl;
-      throw std::bad_cast();
-    }
-  }
+        template <typename T> T &Get() const {
+            if (type_id == typeid(T).hash_code()) {
+                return *const_cast<T *>(reinterpret_cast<const T *>(&data));
+            } else {
+                //      std::cout << " bad cast in variant " << std::endl;
+                throw std::bad_cast();
+            }
+        }

-  size_t TypeId() const { return type_id; }
+        size_t TypeId() const { return type_id; }

-private:
-  static inline size_t invalid_type() { return typeid(void).hash_code(); }
-  typedef VariantHelper<Ts...> helper;
-  size_t type_id;
-  RawData<helper::size> data;
-};
+      private:
+        static inline size_t invalid_type() { return typeid(void).hash_code(); }
+        typedef VariantHelper<Ts...> helper;
+        size_t type_id;
+        RawData<helper::size> data;
+    };

-template <typename T> struct Vistor { typedef T type_t; };
+    template <typename T> struct Vistor { typedef T type_t; };

 } // namespace paddle_mobile
--- a/src/framework/attribute.cpp
+++ b/src/framework/attribute.cpp
@@ -19,5 +19,5 @@ SOFTWARE.
 #include "attribute.h"

 namespace paddle_mobile {
-namespace framework {}
+    namespace framework {}
 } // namespace paddle_mobile
--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -22,107 +22,110 @@ SOFTWARE.
 #include "framework.pb.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-class BlockDesc;
+        class BlockDesc;

-class Attribute {
-public:
-  static Attribute GetAttrValue(const proto::OpDesc::Attr &attr_desc) {
-    //    std::cout << "begin get attr value" << std::endl;
-    Attribute attr;
-    switch (attr_desc.type()) {
-    case proto::AttrType::BOOLEAN: {
-      attr.Set<bool>(attr_desc.b());
-      break;
-    }
-    case proto::AttrType::INT: {
-      attr.Set<int>(attr_desc.i());
-      break;
-    }
-    case proto::AttrType::FLOAT: {
-      attr.Set<float>(attr_desc.f());
-      break;
-    }
-    case proto::AttrType::STRING: {
-      attr.Set<std::string>(attr_desc.s());
-      break;
-    }
-    case proto::AttrType::BOOLEANS: {
-      std::vector<bool> val(attr_desc.bools_size());
-      for (int i = 0; i < attr_desc.bools_size(); ++i) {
-        val[i] = attr_desc.bools(i);
-      }
-      attr.Set<std::vector<bool>>(val);
-      break;
-    }
-    case proto::AttrType::INTS: {
-      std::vector<int> val(attr_desc.ints_size());
-      for (int i = 0; i < attr_desc.ints_size(); ++i) {
-        val[i] = attr_desc.ints(i);
-      }
-      attr.Set<std::vector<int>>(val);
-      break;
-    }
-    case proto::AttrType::FLOATS: {
-      std::vector<float> val(attr_desc.floats_size());
-      for (int i = 0; i < attr_desc.floats_size(); ++i) {
-        val[i] = attr_desc.floats(i);
-      }
-      attr.Set<std::vector<float>>(val);
-      break;
-    }
-    case proto::AttrType::STRINGS: {
-      std::vector<std::string> val(attr_desc.strings_size());
-      for (int i = 0; i < attr_desc.strings_size(); ++i) {
-        val[i] = attr_desc.strings(i);
-      }
-      attr.Set<std::vector<std::string>>(val);
-      break;
-    }
-    case proto::AttrType::LONG: {
-      attr.Set<int64_t>(attr_desc.l());
-      break;
-    }
-    default:
-      //        std::cout << " not support " << std::endl;
-      break;
-    }
-    //    std::cout << "end get attr value" << std::endl;
-    return attr;
-  }
+        class Attribute {
+          public:
+            static Attribute
+            GetAttrValue(const proto::OpDesc::Attr &attr_desc) {
+                //    std::cout << "begin get attr value" << std::endl;
+                Attribute attr;
+                switch (attr_desc.type()) {
+                case proto::AttrType::BOOLEAN: {
+                    attr.Set<bool>(attr_desc.b());
+                    break;
+                }
+                case proto::AttrType::INT: {
+                    attr.Set<int>(attr_desc.i());
+                    break;
+                }
+                case proto::AttrType::FLOAT: {
+                    attr.Set<float>(attr_desc.f());
+                    break;
+                }
+                case proto::AttrType::STRING: {
+                    attr.Set<std::string>(attr_desc.s());
+                    break;
+                }
+                case proto::AttrType::BOOLEANS: {
+                    std::vector<bool> val(attr_desc.bools_size());
+                    for (int i = 0; i < attr_desc.bools_size(); ++i) {
+                        val[i] = attr_desc.bools(i);
+                    }
+                    attr.Set<std::vector<bool>>(val);
+                    break;
+                }
+                case proto::AttrType::INTS: {
+                    std::vector<int> val(attr_desc.ints_size());
+                    for (int i = 0; i < attr_desc.ints_size(); ++i) {
+                        val[i] = attr_desc.ints(i);
+                    }
+                    attr.Set<std::vector<int>>(val);
+                    break;
+                }
+                case proto::AttrType::FLOATS: {
+                    std::vector<float> val(attr_desc.floats_size());
+                    for (int i = 0; i < attr_desc.floats_size(); ++i) {
+                        val[i] = attr_desc.floats(i);
+                    }
+                    attr.Set<std::vector<float>>(val);
+                    break;
+                }
+                case proto::AttrType::STRINGS: {
+                    std::vector<std::string> val(attr_desc.strings_size());
+                    for (int i = 0; i < attr_desc.strings_size(); ++i) {
+                        val[i] = attr_desc.strings(i);
+                    }
+                    attr.Set<std::vector<std::string>>(val);
+                    break;
+                }
+                case proto::AttrType::LONG: {
+                    attr.Set<int64_t>(attr_desc.l());
+                    break;
+                }
+                default:
+                    //        std::cout << " not support " << std::endl;
+                    break;
+                }
+                //    std::cout << "end get attr value" << std::endl;
+                return attr;
+            }

-  Attribute() {}
-  template <typename T, typename... Args> Attribute &Set(Args &&... args) {
-    variant_.Set<T>(args...);
-    return *this;
-  }
+            Attribute() {}
+            template <typename T, typename... Args>
+            Attribute &Set(Args &&... args) {
+                variant_.Set<T>(args...);
+                return *this;
+            }

-  template <typename T> T &Get() const { return variant_.Get<T>(); }
+            template <typename T> T &Get() const { return variant_.Get<T>(); }

-private:
-  Variant<int, float, std::string, std::vector<int>, std::vector<float>,
-          std::vector<std::string>, bool, std::vector<bool>, BlockDesc *,
-          int64_t>
-      variant_;
-};
+          private:
+            Variant<int, float, std::string, std::vector<int>,
+                    std::vector<float>, std::vector<std::string>, bool,
+                    std::vector<bool>, BlockDesc *, int64_t>
+                variant_;
+        };

-using AttributeMap = std::unordered_map<std::string, Attribute>;
+        using AttributeMap = std::unordered_map<std::string, Attribute>;

-class AttrReader {
-public:
-  explicit AttrReader(const AttributeMap &attrs) : attrs_(attrs) {}
+        class AttrReader {
+          public:
+            explicit AttrReader(const AttributeMap &attrs) : attrs_(attrs) {}

-  template <typename T> inline T Get(const std::string &name) const {
-    //          PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in
-    //          AttributeMap",
-    //                         name);
-    return ((Attribute)attrs_.at(name)).Get<T>();
-  }
+            template <typename T> inline T Get(const std::string &name) const {
+                //          PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should
+                //          be in
+                //          AttributeMap",
+                //                         name);
+                return ((Attribute)attrs_.at(name)).Get<T>();
+            }

-private:
-  const AttributeMap &attrs_;
-};
+          private:
+            const AttributeMap &attrs_;
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/block_desc.cpp
+++ b/src/framework/block_desc.cpp
@@ -19,32 +19,32 @@ SOFTWARE.
 #include "block_desc.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-std::vector<std::shared_ptr<VarDesc>> BlockDesc::Vars() const {
-  std::vector<std::shared_ptr<VarDesc>> res;
-  for (const auto &p : vars_) {
-    res.push_back(p.second);
-  }
-  return res;
-}
+        std::vector<std::shared_ptr<VarDesc>> BlockDesc::Vars() const {
+            std::vector<std::shared_ptr<VarDesc>> res;
+            for (const auto &p : vars_) {
+                res.push_back(p.second);
+            }
+            return res;
+        }

-std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const {
-  std::vector<std::shared_ptr<OpDesc>> res;
-  for (const auto &op : ops_) {
-    res.push_back(op);
-  }
-  return res;
-}
+        std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const {
+            std::vector<std::shared_ptr<OpDesc>> res;
+            for (const auto &op : ops_) {
+                res.push_back(op);
+            }
+            return res;
+        }

-BlockDesc::BlockDesc(const proto::BlockDesc &desc) : desc_(desc) {
-  for (const proto::VarDesc &var_desc : desc_.vars()) {
-    vars_[var_desc.name()].reset(new VarDesc(var_desc));
-  }
-  for (const proto::OpDesc &op_desc : desc_.ops()) {
-    ops_.emplace_back(new framework::OpDesc(op_desc));
-  }
-}
+        BlockDesc::BlockDesc(const proto::BlockDesc &desc) : desc_(desc) {
+            for (const proto::VarDesc &var_desc : desc_.vars()) {
+                vars_[var_desc.name()].reset(new VarDesc(var_desc));
+            }
+            for (const proto::OpDesc &op_desc : desc_.ops()) {
+                ops_.emplace_back(new framework::OpDesc(op_desc));
+            }
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/block_desc.h
+++ b/src/framework/block_desc.h
@@ -24,46 +24,50 @@ SOFTWARE.
 #include "var_desc.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-class BlockDesc : PaddleMobileObject {
-public:
-  BlockDesc(const proto::BlockDesc &desc);
+        class BlockDesc : PaddleMobileObject {
+          public:
+            BlockDesc(const proto::BlockDesc &desc);

-  const int &ID() const { return desc_.idx(); }
+            const int &ID() const { return desc_.idx(); }

-  const int &Parent() const { return desc_.parent_idx(); }
+            const int &Parent() const { return desc_.parent_idx(); }

-  bool operator==(const paddle_mobile::framework::BlockDesc &in_block) const {
-    return this->ID() == in_block.ID() && this->Parent() == in_block.Parent();
-  }
+            bool operator==(
+                const paddle_mobile::framework::BlockDesc &in_block) const {
+                return this->ID() == in_block.ID() &&
+                       this->Parent() == in_block.Parent();
+            }

-  bool operator<(const paddle_mobile::framework::BlockDesc &in_block) const {
-    return this->ID() < in_block.ID() && this->Parent() < in_block.Parent();
-  }
+            bool operator<(
+                const paddle_mobile::framework::BlockDesc &in_block) const {
+                return this->ID() < in_block.ID() &&
+                       this->Parent() < in_block.Parent();
+            }

-  std::vector<std::shared_ptr<VarDesc>> Vars() const;
-  std::vector<std::shared_ptr<OpDesc>> Ops() const;
+            std::vector<std::shared_ptr<VarDesc>> Vars() const;
+            std::vector<std::shared_ptr<OpDesc>> Ops() const;

-private:
-  proto::BlockDesc desc_;
-  std::vector<std::shared_ptr<OpDesc>> ops_;
-  std::unordered_map<std::string, std::shared_ptr<VarDesc>> vars_;
-};
+          private:
+            proto::BlockDesc desc_;
+            std::vector<std::shared_ptr<OpDesc>> ops_;
+            std::unordered_map<std::string, std::shared_ptr<VarDesc>> vars_;
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile

 namespace std {

-template <> struct hash<paddle_mobile::framework::BlockDesc> {
-  typedef paddle_mobile::framework::BlockDesc argument_type;
-  typedef std::size_t result_type;
-  result_type operator()(argument_type const &s) const noexcept {
-    result_type const h1(std::hash<int>{}(s.ID()));
-    result_type const h2(std::hash<int>{}(s.ID()));
-    return h1 ^ (h2 << 1);
-  }
-};
+    template <> struct hash<paddle_mobile::framework::BlockDesc> {
+        typedef paddle_mobile::framework::BlockDesc argument_type;
+        typedef std::size_t result_type;
+        result_type operator()(argument_type const &s) const noexcept {
+            result_type const h1(std::hash<int>{}(s.ID()));
+            result_type const h2(std::hash<int>{}(s.ID()));
+            return h1 ^ (h2 << 1);
+        }
+    };

 } // namespace std
--- a/src/framework/data_layout.h
+++ b/src/framework/data_layout.h
@@ -19,49 +19,50 @@ limitations under the License. */
 #include <string>

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-enum class DataLayout {
-  kNHWC = 0,
-  kNCHW = 1,
-  kAnyLayout = 2,
-};
+        enum class DataLayout {
+            kNHWC = 0,
+            kNCHW = 1,
+            kAnyLayout = 2,
+        };

-inline DataLayout StringToDataLayout(const std::string &str) {
-  std::string s(str);
-  for (size_t i = 0; i < s.size(); ++i) {
-    s[i] = toupper(s[i]);
-  }
+        inline DataLayout StringToDataLayout(const std::string &str) {
+            std::string s(str);
+            for (size_t i = 0; i < s.size(); ++i) {
+                s[i] = toupper(s[i]);
+            }

-  if (s == "NHWC") {
-    return DataLayout::kNHWC;
-  } else if (s == "NCHW") {
-    return DataLayout::kNCHW;
-  } else if (s == "ANYLAYOUT") {
-    return DataLayout::kAnyLayout;
-  } else {
-    //    std::cout << "Unknown storage order string: %s", s;
-  }
-}
+            if (s == "NHWC") {
+                return DataLayout::kNHWC;
+            } else if (s == "NCHW") {
+                return DataLayout::kNCHW;
+            } else if (s == "ANYLAYOUT") {
+                return DataLayout::kAnyLayout;
+            } else {
+                //    std::cout << "Unknown storage order string: %s", s;
+            }
+        }

-inline std::string DataLayoutToString(const DataLayout &data_layout) {
-  switch (data_layout) {
-  case DataLayout::kNHWC:
-    return "NHWC";
-  case DataLayout::kNCHW:
-    return "NCHW";
-  case DataLayout::kAnyLayout:
-    return "ANY_LAYOUT";
-  default:
-    break;
-    //      std::cout << "unknown DataLayou %d", data_layout;
-  }
-}
+        inline std::string DataLayoutToString(const DataLayout &data_layout) {
+            switch (data_layout) {
+            case DataLayout::kNHWC:
+                return "NHWC";
+            case DataLayout::kNCHW:
+                return "NCHW";
+            case DataLayout::kAnyLayout:
+                return "ANY_LAYOUT";
+            default:
+                break;
+                //      std::cout << "unknown DataLayou %d", data_layout;
+            }
+        }

-inline std::ostream &operator<<(std::ostream &out, const DataLayout &l) {
-  out << DataLayoutToString(l);
-  return out;
-}
+        inline std::ostream &operator<<(std::ostream &out,
+                                        const DataLayout &l) {
+            out << DataLayoutToString(l);
+            return out;
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/data_transform.cpp
+++ b/src/framework/data_transform.cpp
@@ -21,67 +21,72 @@ SOFTWARE.
 #include "data_transform.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-static void PassTensorData(Tensor *from, Tensor *to) {
-  to->ShareDataWith(*from);
-  *from = Tensor();
-}
+        static void PassTensorData(Tensor *from, Tensor *to) {
+            to->ShareDataWith(*from);
+            *from = Tensor();
+        }

-void DataTransform(const OpKernelType &expected_kernel_type,
-                   const OpKernelType &kernel_type_for_var,
-                   const Tensor &input_tensor, Tensor *output_tensor) {
-  bool transformed = false;
-  Tensor in;
-  in.ShareDataWith(input_tensor);
-  Tensor out;
+        void DataTransform(const OpKernelType &expected_kernel_type,
+                           const OpKernelType &kernel_type_for_var,
+                           const Tensor &input_tensor, Tensor *output_tensor) {
+            bool transformed = false;
+            Tensor in;
+            in.ShareDataWith(input_tensor);
+            Tensor out;

-  //  // do layout transform
-  //  if (NeedTransformLayout(expected_kernel_type.data_layout_,
-  //                          kernel_type_for_var.data_layout_)) {
-  //    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
-  //    transformed = true;
-  //    PassTensorData(&out, &in);
-  //  }
-  //
-  //  // do data type transform
-  //  if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
-  //    TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
-  //    transformed = true;
-  //    PassTensorData(&out, &in);
-  //  }
-  //
-  //  // do device transform
-  //  if (!platform::is_same_place(kernel_type_for_var.place_,
-  //                               expected_kernel_type.place_)) {
-  //    TransDataDevice(in, expected_kernel_type.place_, &out);
-  //    transformed = true;
-  //    PassTensorData(&out, &in);
-  //  }
-  //
-  //  PADDLE_ENFORCE(transformed, "No transform is applied, please check!");
-  // get output data
-  output_tensor->ShareDataWith(in);
-}
+            //  // do layout transform
+            //  if (NeedTransformLayout(expected_kernel_type.data_layout_,
+            //                          kernel_type_for_var.data_layout_)) {
+            //    TransDataLayout(kernel_type_for_var, expected_kernel_type, in,
+            //    &out);
+            //    transformed = true;
+            //    PassTensorData(&out, &in);
+            //  }
+            //
+            //  // do data type transform
+            //  if (expected_kernel_type.data_type_ !=
+            //  kernel_type_for_var.data_type_) {
+            //    TransDataType(kernel_type_for_var, expected_kernel_type, in,
+            //    &out);
+            //    transformed = true;
+            //    PassTensorData(&out, &in);
+            //  }
+            //
+            //  // do device transform
+            //  if (!platform::is_same_place(kernel_type_for_var.place_,
+            //                               expected_kernel_type.place_)) {
+            //    TransDataDevice(in, expected_kernel_type.place_, &out);
+            //    transformed = true;
+            //    PassTensorData(&out, &in);
+            //  }
+            //
+            //  PADDLE_ENFORCE(transformed, "No transform is applied, please
+            //  check!");
+            // get output data
+            output_tensor->ShareDataWith(in);
+        }

-void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor,
-                            Variable &out_var) {
-  //  if (in_var.IsType<LoDTensor>()) {
-  //    auto& in_lod_tensor = in_var.Get<LoDTensor>();
-  //    auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
-  //    tran_lod_tensor->set_lod(in_lod_tensor.lod());
-  //    tran_lod_tensor->set_layout(in_lod_tensor.layout());
-  //    tran_lod_tensor->ShareDataWith(tensor);
-  //  } else if (in_var.IsType<SelectedRows>()) {
-  //    auto& in_selected_rows = in_var.Get<SelectedRows>();
-  //    auto* trans_selected_rows = out_var.GetMutable<SelectedRows>();
-  //    trans_selected_rows->set_height(in_selected_rows.height());
-  //    trans_selected_rows->set_rows(in_selected_rows.rows());
-  //    trans_selected_rows->mutable_value()->ShareDataWith(tensor);
-  //  } else {
-  //    PADDLE_THROW("unknown var type");
-  //  }
-}
+        void CopyVariableWithTensor(const Variable &in_var,
+                                    const Tensor &tensor, Variable &out_var) {
+            //  if (in_var.IsType<LoDTensor>()) {
+            //    auto& in_lod_tensor = in_var.Get<LoDTensor>();
+            //    auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
+            //    tran_lod_tensor->set_lod(in_lod_tensor.lod());
+            //    tran_lod_tensor->set_layout(in_lod_tensor.layout());
+            //    tran_lod_tensor->ShareDataWith(tensor);
+            //  } else if (in_var.IsType<SelectedRows>()) {
+            //    auto& in_selected_rows = in_var.Get<SelectedRows>();
+            //    auto* trans_selected_rows =
+            //    out_var.GetMutable<SelectedRows>();
+            //    trans_selected_rows->set_height(in_selected_rows.height());
+            //    trans_selected_rows->set_rows(in_selected_rows.rows());
+            //    trans_selected_rows->mutable_value()->ShareDataWith(tensor);
+            //  } else {
+            //    PADDLE_THROW("unknown var type");
+            //  }
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/data_transform.h
+++ b/src/framework/data_transform.h
@@ -28,14 +28,14 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-void DataTransform(const OpKernelType &expected_kernel_type,
-                   const OpKernelType &kernel_type_for_var,
-                   const Tensor &input_tensor, Tensor *out);
+        void DataTransform(const OpKernelType &expected_kernel_type,
+                           const OpKernelType &kernel_type_for_var,
+                           const Tensor &input_tensor, Tensor *out);

-void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor,
-                            Variable &out_var);
+        void CopyVariableWithTensor(const Variable &in_var,
+                                    const Tensor &tensor, Variable &out_var);

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/data_type.h
+++ b/src/framework/data_type.h
@@ -21,23 +21,23 @@ SOFTWARE.
 #include "framework.pb.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-//    inline proto::VarType::Type ToDataType(std::type_index type) {
-//        using namespace paddle_mobile::framework::proto;
-//        if (typeid(float).hash_code() == type.hash_code()) {
-//            return proto::VarType::FP32;
-//        } else if (typeid(double).hash_code() == type.hash_code()) {
-//            return proto::VarType::FP64;
-//        } else if (typeid(int).hash_code() == type.hash_code()) {
-//            return proto::VarType::INT32;
-//        } else if (typeid(int64_t).hash_code() == type.hash_code()) {
-//            return proto::VarType::INT64;
-//        } else if (typeid(bool).hash_code() == type.hash_code()) {
-//            return proto::VarType::BOOL;
-//        } else {
-////            PADDLE_THROW("Not supported");
-//        }
-//    }
-}
+        //    inline proto::VarType::Type ToDataType(std::type_index type) {
+        //        using namespace paddle_mobile::framework::proto;
+        //        if (typeid(float).hash_code() == type.hash_code()) {
+        //            return proto::VarType::FP32;
+        //        } else if (typeid(double).hash_code() == type.hash_code()) {
+        //            return proto::VarType::FP64;
+        //        } else if (typeid(int).hash_code() == type.hash_code()) {
+        //            return proto::VarType::INT32;
+        //        } else if (typeid(int64_t).hash_code() == type.hash_code()) {
+        //            return proto::VarType::INT64;
+        //        } else if (typeid(bool).hash_code() == type.hash_code()) {
+        //            return proto::VarType::BOOL;
+        //        } else {
+        ////            PADDLE_THROW("Not supported");
+        //        }
+        //    }
+    }
 } // namespace paddle_mobile
--- a/src/framework/ddim.cc
+++ b/src/framework/ddim.cc
@@ -15,311 +15,320 @@ limitations under the License. */
 #include "ddim.h"

 namespace paddle_mobile {
-namespace framework {
-
-/// @cond HIDDEN
-
-template <int i> Dim<i> make_dim(const int64_t *d) {
-  return Dim<i>(*d, make_dim<i - 1>(d + 1));
-}
-
-template <> Dim<0> make_dim<0>(const int64_t *d) { return Dim<0>(*d); }
-
-void make_ddim(DDim &ddim, const int64_t *dims, int n) {
-  switch (n) {
-  case 0:
-    ddim = make_dim<0>(dims);
-    break;
-  case 1:
-    ddim = make_dim<1>(dims);
-    break;
-  case 2:
-    ddim = make_dim<2>(dims);
-    break;
-  case 3:
-    ddim = make_dim<3>(dims);
-    break;
-  case 4:
-    ddim = make_dim<4>(dims);
-    break;
-  case 5:
-    ddim = make_dim<5>(dims);
-    break;
-  case 6:
-    ddim = make_dim<6>(dims);
-    break;
-  case 7:
-    ddim = make_dim<7>(dims);
-    break;
-  case 8:
-    ddim = make_dim<8>(dims);
-    break;
-  case 9:
-    ddim = make_dim<9>(dims);
-    break;
-  default:
-    //      std::cout << "Dynamic dimensions must have between [1, 9]
-    //      dimensions.";
-    break;
-  }
-}
-
-/// @endcond
-
-DDim make_ddim(std::initializer_list<int64_t> dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, dims.begin(), dims.size());
-  return result;
-}
-
-DDim make_ddim(const std::vector<int64_t> &dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, &dims[0], dims.size());
-  return result;
-}
-
-DDim make_ddim(const std::vector<int> &dims) {
-  std::vector<int64_t> res(dims.size());
-  std::transform(dims.begin(), dims.end(), res.begin(),
-                 [](int d) { return static_cast<int64_t>(d); });
-  return make_ddim(res);
-}
-
-/// @cond HIDDEN
-// XXX For some reason, putting this in an anonymous namespace causes errors
-struct DynamicMutableIndexer : Vistor<int64_t &> {
-public:
-  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
-
-  template <int D> int64_t &operator()(Dim<D> &dim) const { return dim[idx_]; }
-
-private:
-  int idx_;
-};
-
-struct DynamicConstIndexer : public Vistor<int64_t> {
-public:
-  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
-
-  template <int D> int64_t operator()(const Dim<D> &dim) const {
-    return dim[idx_];
-  }
-
-private:
-  int idx_;
-};
-
-/// @endcond
-
-int64_t &DDim::operator[](int idx) {
-  return DDim::ApplyVistor(DynamicMutableIndexer(idx), *this);
-}
-
-int64_t DDim::operator[](int idx) const {
-  return DDim::ApplyVistor(DynamicConstIndexer(idx), *this);
-}
-
-int DDim::size() const { return arity(*this); }
-
-bool DDim::operator==(DDim d) const {
-  //  if (var.which() != d.getVar().which()) {
-  //    return false;
-  //  } else {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    if (v1[i] != v2[i]) {
-      return false;
-    }
-  }
-
-  return true;
-  //  }
-}
-
-bool DDim::operator!=(DDim d) const { return !(*this == d); }
-
-DDim DDim::operator+(DDim d) const {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
-
-  std::vector<int64_t> v3;
-
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] + v2[i]);
-  }
-
-  return make_ddim(v3);
-}
-
-DDim DDim::operator*(DDim d) const {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
-
-  std::vector<int64_t> v3;
-
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] * v2[i]);
-  }
-
-  return make_ddim(v3);
-}
-
-int64_t get(const DDim &ddim, int idx) { return ddim[idx]; }
-
-void set(DDim &ddim, int idx, int value) { ddim[idx] = value; }
-
-/// @cond HIDDEN
-struct VectorizeVisitor : Vistor<void> {
-  std::vector<int64_t> &vector;
-
-  explicit VectorizeVisitor(std::vector<int64_t> &v) : vector(v) {}
-
-  template <typename T> void operator()(const T &t) {
-    vector.push_back(t.head);
-    this->operator()(t.tail);
-  }
-
-  void operator()(const Dim<0> &t) {}
-};
-/// @endcond
-
-std::vector<int64_t> vectorize(const DDim &ddim) {
-  std::vector<int64_t> result;
-  VectorizeVisitor visitor(result);
-  DDim::ApplyVistor(visitor, ddim);
-  return result;
-}
-
-// NOTE: framework::vectorize converts to type int64_t
-//       which does not fit cudnn inputs.
-std::vector<int> vectorize2int(const DDim &ddim) {
-  std::vector<int64_t> temp = vectorize(ddim);
-  std::vector<int> result(temp.begin(), temp.end());
-  return result;
-}
-
-struct ProductVisitor : Vistor<int64_t> {
-  template <int D> int64_t operator()(const Dim<D> &dim) {
-    return product(dim);
-  }
-};
-
-int64_t product(const DDim &ddim) {
-  ProductVisitor visitor;
-  return DDim::ApplyVistor(visitor, ddim);
-}
-
-struct SliceVectorizeVisitor : Vistor<void> {
-  std::vector<int64_t> &vector;
-  int begin;
-  int end;
-
-  SliceVectorizeVisitor(std::vector<int64_t> &v, int b, int e)
-      : vector(v), begin(b), end(e) {
-    //    PADDLE_ENFORCE(begin < end,
-    //                   "Begin index must be less than end index in ddim
-    //                   slice.");
-    //    PADDLE_ENFORCE(begin >= 0,
-    //                   "Begin index can't be less than zero in ddim slice.");
-  }
-
-  template <int S> void operator()(const Dim<S> &dim) {
-    if (begin == 0) {
-      vector.push_back(dim.head);
-    } else {
-      --begin;
-    }
-    --end;
-    if (end > 0) {
-      this->operator()(dim.tail);
-    }
-  }
-
-  void operator()(const Dim<0> &dim) {
-    //    PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
-  }
-};
-
-DDim slice_ddim(const DDim &ddim, int begin, int end) {
-  std::vector<int64_t> vec;
-  vec.reserve(end - begin);
-  SliceVectorizeVisitor visitor(vec, begin, end);
-  //  boost::apply_visitor(visitor, dim);
-  DDim::ApplyVistor(visitor, ddim);
-  //  visitor(ddim.var.Get<Dim<4>>());
-  return make_ddim(vec);
-}
-
-/// \cond HIDDEN
-
-struct ArityVisitor : Vistor<int> {
-  template <int D> int operator()(Dim<D>) const { return D; }
-};
-
-/// \endcond
-
-int arity(const DDim &d) {
-  ArityVisitor arityVisitor = ArityVisitor();
-  return DDim::ApplyVistor(arityVisitor, d);
-  //  return arityVisitor(d.var.Get<Dim<4>>());
-  //  return boost::apply_visitor(ArityVisitor(), d); }
-}
-/// \cond HIDDEN
-
-/// \endcond
-
-struct OSVistor : Vistor<std::ostream &> {
-  OSVistor(std::ostream &os) : os_(os) {}
-
-  template <int D> std::ostream &operator()(Dim<D> dim) const {
-    return os_ << dim;
-  }
-
-private:
-  std::ostream &os_;
-};
-
-std::ostream &operator<<(std::ostream &os, const DDim &ddim) {
-  auto vistor = OSVistor(os);
-  DDim::ApplyVistor(vistor, ddim);
-  return os;
-}
-
-DDim::DDim(std::initializer_list<int64_t> init_list) {
-  *this = make_ddim(init_list);
-}
-
-DDim flatten_to_2d(const DDim &src, int num_col_dims) {
-  int rank = src.size();
-  return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
-                    product(slice_ddim(src, num_col_dims, rank))});
-}
-
-DDim flatten_to_1d(const DDim &src) { return make_ddim({product(src)}); }
-
-DDim stride(const DDim &ddim) {
-  std::vector<int64_t> strides(ddim.size());
-  strides[ddim.size() - 1] = 1;
-  for (int i = ddim.size() - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * ddim[i + 1];
-  }
-  return framework::make_ddim(strides);
-}
-
-DDim stride_numel(const framework::DDim &ddim) {
-  std::vector<int64_t> strides(ddim.size());
-  strides[ddim.size() - 1] = ddim[ddim.size() - 1];
-  for (int i = ddim.size() - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * ddim[i];
-  }
-  return framework::make_ddim(strides);
-}
-
-} // namespace framework
+    namespace framework {
+
+        /// @cond HIDDEN
+        // Recursively packs d[0..i-1] into a Dim<i>: head first, then tail.
+        template <int i> Dim<i> make_dim(const int64_t *d) {
+            return Dim<i>(*d, make_dim<i - 1>(d + 1));
+        }
+        // Base case: d is one past the last extent here, so do not read it.
+        template <> Dim<0> make_dim<0>(const int64_t *) { return Dim<0>(); }
+        // Stores the first n entries of dims into ddim as a Dim<n>.
+        void make_ddim(DDim &ddim, const int64_t *dims, int n) {
+            switch (n) {
+            case 0:
+                ddim = make_dim<0>(dims);
+                break;
+            case 1:
+                ddim = make_dim<1>(dims);
+                break;
+            case 2:
+                ddim = make_dim<2>(dims);
+                break;
+            case 3:
+                ddim = make_dim<3>(dims);
+                break;
+            case 4:
+                ddim = make_dim<4>(dims);
+                break;
+            case 5:
+                ddim = make_dim<5>(dims);
+                break;
+            case 6:
+                ddim = make_dim<6>(dims);
+                break;
+            case 7:
+                ddim = make_dim<7>(dims);
+                break;
+            case 8:
+                ddim = make_dim<8>(dims);
+                break;
+            case 9:
+                ddim = make_dim<9>(dims);
+                break;
+            default:
+                // n outside [0, 9]: ddim is left unchanged. Disabled output:
+                //   std::cout << "Dynamic dimensions must have between
+                //   [1, 9] dimensions.";
+                break;
+            }
+        }
+
+        /// @endcond
+        // Builds a DDim whose rank and extents come from a braced list.
+        DDim make_ddim(std::initializer_list<int64_t> dims) {
+            DDim built(make_dim(0)); // placeholder, overwritten below
+            make_ddim(built, dims.begin(), static_cast<int>(dims.size()));
+            return built;
+        }
+        // Vector overload; data() (unlike &dims[0]) is valid for empty dims.
+        DDim make_ddim(const std::vector<int64_t> &dims) {
+            DDim result(make_dim(0));
+            make_ddim(result, dims.data(), dims.size());
+            return result;
+        }
+        // Overload for int extents: each entry is widened to int64_t and the
+        // result is forwarded to the int64_t vector overload.
+        DDim make_ddim(const std::vector<int> &dims) {
+            // Range construction converts every element individually.
+            const std::vector<int64_t> widened(dims.begin(), dims.end());
+            return make_ddim(widened);
+        }
+
+        /// @cond HIDDEN
+        // XXX For some reason, putting this in an anonymous namespace causes
+        // errors
+        // Visitor that returns a writable reference to component idx_.
+        struct DynamicMutableIndexer : Vistor<int64_t &> {
+            explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
+
+            template <int N> int64_t &operator()(Dim<N> &dim) const {
+                return dim[idx_];
+            }
+
+          private:
+            int idx_; // position requested through DDim::operator[]
+        };
+
+        // Visitor that returns component idx_ of a const Dim by value.
+        struct DynamicConstIndexer : Vistor<int64_t> {
+            explicit DynamicConstIndexer(int idx) : idx_(idx) {}
+
+            template <int N> int64_t operator()(const Dim<N> &dim) const {
+                return dim[idx_];
+            }
+
+          private:
+            int idx_; // position requested through DDim::operator[]
+        };
+
+        /// @endcond
+
+        int64_t &DDim::operator[](int idx) {
+            return DDim::ApplyVistor(DynamicMutableIndexer(idx), *this);
+        }
+
+        int64_t DDim::operator[](int idx) const {
+            return DDim::ApplyVistor(DynamicConstIndexer(idx), *this);
+        }
+
+        int DDim::size() const { return arity(*this); }
+
+        bool DDim::operator==(DDim d) const {
+            // Two DDims are equal iff they have the same rank and the same
+            // extent in every dimension.
+            const std::vector<int64_t> lhs = vectorize(*this);
+            const std::vector<int64_t> rhs = vectorize(d);
+
+            // Ranks must be compared explicitly: an element-wise loop bounded
+            // by lhs.size() alone would index past the end of a shorter rhs
+            // and would treat a longer rhs with a matching prefix as equal.
+            if (lhs.size() != rhs.size()) {
+                return false;
+            }
+
+            // Same length from here on, so this is purely element-wise.
+            return lhs == rhs;
+        }
+
+        bool DDim::operator!=(DDim d) const { return !(*this == d); }
+
+        DDim DDim::operator+(DDim d) const {
+            // Element-wise sum. Both operands must have the same rank; this
+            // is only verified by the assert below.
+            const std::vector<int64_t> lhs = vectorize(*this);
+            const std::vector<int64_t> rhs = vectorize(d);
+
+            assert(lhs.size() == rhs.size());
+
+            std::vector<int64_t> sum(lhs.size());
+            std::transform(lhs.begin(), lhs.end(), rhs.begin(), sum.begin(),
+                           [](int64_t a, int64_t b) { return a + b; });
+
+            return make_ddim(sum);
+        }
+
+        DDim DDim::operator*(DDim d) const {
+            // Element-wise product. Both operands must have the same rank;
+            // this is only verified by the assert below.
+            const std::vector<int64_t> lhs = vectorize(*this);
+            const std::vector<int64_t> rhs = vectorize(d);
+
+            assert(lhs.size() == rhs.size());
+
+            std::vector<int64_t> prod(lhs.size());
+            std::transform(lhs.begin(), lhs.end(), rhs.begin(), prod.begin(),
+                           [](int64_t a, int64_t b) { return a * b; });
+
+            return make_ddim(prod);
+        }
+
+        int64_t get(const DDim &ddim, int idx) { return ddim[idx]; }
+
+        void set(DDim &ddim, int idx, int value) { ddim[idx] = value; }
+
+        /// @cond HIDDEN
+        struct VectorizeVisitor : Vistor<void> {
+            std::vector<int64_t> &vector; // output; extents are appended
+
+            explicit VectorizeVisitor(std::vector<int64_t> &v) : vector(v) {}
+            // Emits the leading extent, then recurses into the remainder.
+            template <typename DimT> void operator()(const DimT &dim) {
+                vector.push_back(dim.head);
+                (*this)(dim.tail);
+            }
+            // Dim<0> terminates the recursion and contributes nothing.
+            void operator()(const Dim<0> &) {}
+        };
+        /// @endcond
+
+        // Flattens ddim into a vector holding one extent per dimension.
+        std::vector<int64_t> vectorize(const DDim &ddim) {
+            std::vector<int64_t> extents;
+            DDim::ApplyVistor(VectorizeVisitor(extents), ddim);
+            return extents;
+        }
+
+        // NOTE: framework::vectorize converts to type int64_t
+        //       which does not fit cudnn inputs.
+        std::vector<int> vectorize2int(const DDim &ddim) {
+            // Narrowing int64_t -> int happens element by element here.
+            const std::vector<int64_t> wide = vectorize(ddim);
+            return std::vector<int>(wide.begin(), wide.end());
+        }
+
+        struct ProductVisitor : Vistor<int64_t> {
+            template <int D> int64_t operator()(const Dim<D> &dim) {
+                return product(dim);
+            }
+        };
+
+        int64_t product(const DDim &ddim) {
+            ProductVisitor visitor;
+            return DDim::ApplyVistor(visitor, ddim);
+        }
+
+        // Collects the extents at positions [begin, end) of a Dim into vector.
+        struct SliceVectorizeVisitor : Vistor<void> {
+            std::vector<int64_t> &vector; // receives the selected extents
+            int begin;                    // leading extents still to skip
+            int end;                      // extents still to visit
+
+            SliceVectorizeVisitor(std::vector<int64_t> &v, int b, int e)
+                : vector(v), begin(b), end(e) {
+                // Disabled preconditions (begin < end, begin >= 0):
+                //   PADDLE_ENFORCE(begin < end, "Begin index must be less
+                //                  than end index in ddim slice.");
+                //   PADDLE_ENFORCE(begin >= 0, "Begin index can't be less
+                //                  than zero in ddim slice.");
+            }
+
+            // begin != 0: skip one extent; else append it. Stops when end <= 0.
+            template <int S> void operator()(const Dim<S> &dim) {
+                if (begin == 0) {
+                    vector.push_back(dim.head);
+                } else {
+                    --begin;
+                }
+                --end;
+                if (end > 0) {
+                    this->operator()(dim.tail);
+                }
+            }
+
+            void operator()(const Dim<0> &dim) {
+                // Disabled check: PADDLE_ENFORCE(end == 0,
+                //   "End index in ddim slice is out of bound.");
+            }
+        };
+
+        DDim slice_ddim(const DDim &ddim, int begin, int end) {
+            // Gather the extents at positions [begin, end) and rebuild a DDim
+            // from them.
+            std::vector<int64_t> picked;
+            picked.reserve(end - begin);
+
+            DDim::ApplyVistor(SliceVectorizeVisitor(picked, begin, end), ddim);
+            return make_ddim(picked);
+        }
+
+        /// \cond HIDDEN
+
+        struct ArityVisitor : Vistor<int> {
+            template <int D> int operator()(Dim<D>) const { return D; }
+        };
+
+        /// \endcond
+
+        int arity(const DDim &d) {
+            // The visitor reports the compile-time rank N of the Dim<N>
+            // alternative that d currently holds; that rank is the number
+            // of dimensions.
+            return DDim::ApplyVistor(ArityVisitor(), d);
+        }
+        /// \cond HIDDEN
+
+        /// \endcond
+
+        // Visitor that streams the visited Dim<N> to a fixed output stream.
+        struct OSVistor : Vistor<std::ostream &> {
+            OSVistor(std::ostream &os) : os_(os) {}
+            template <int N> std::ostream &operator()(Dim<N> dim) const {
+                return os_ << dim;
+            }
+
+          private:
+            std::ostream &os_; // destination stream
+        };
+
+        std::ostream &operator<<(std::ostream &os, const DDim &ddim) {
+            // Delegate formatting to the Dim<N> currently stored in ddim.
+            DDim::ApplyVistor(OSVistor(os), ddim);
+            return os;
+        }
+
+        DDim::DDim(std::initializer_list<int64_t> init_list) {
+            *this = make_ddim(init_list);
+        }
+
+        DDim flatten_to_2d(const DDim &src, int num_col_dims) {
+            const DDim lead = slice_ddim(src, 0, num_col_dims); // row part
+            const DDim rest = slice_ddim(src, num_col_dims, src.size());
+            return make_ddim({product(lead), product(rest)});
+        }
+
+        DDim flatten_to_1d(const DDim &src) {
+            return make_ddim({product(src)});
+        }
+        // Entry i is the product of all extents after position i.
+        DDim stride(const DDim &ddim) {
+            const int rank = ddim.size();
+            // Filling with 1 sets the innermost stride without indexing
+            // strides[rank - 1], which is out of bounds when rank is 0.
+            std::vector<int64_t> strides(rank, 1);
+            for (int i = rank - 2; i >= 0; --i)
+                strides[i] = strides[i + 1] * ddim[i + 1];
+            return framework::make_ddim(strides);
+        }
+        // Entry i is the product of extents i..rank-1; safe for rank 0.
+        DDim stride_numel(const framework::DDim &ddim) {
+            const int rank = ddim.size();
+            std::vector<int64_t> strides(rank);
+            for (int i = rank - 1; i >= 0; --i) {
+                strides[i] = ddim[i] * (i + 1 < rank ? strides[i + 1] : 1);
+            }
+            return framework::make_ddim(strides);
+        }
+
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/ddim.h
+++ b/src/framework/ddim.h
@@ -22,140 +22,145 @@ limitations under the License. */
 #include <vector>

 namespace paddle_mobile {
-namespace framework {
-
-/**
- * \brief A dynamically sized dimension.
- *
- * The number of dimensions must be between [1, 9].
- */
-struct DDim {
-  typedef Variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
-                  Dim<7>, Dim<8>, Dim<9>>
-      DDimVar;
-  DDimVar var;
-
-  template <typename Vistor>
-  static typename Vistor::type_t ApplyVistor(Vistor vistor, const DDim &d) {
-    if (d.var.TypeId() == typeid(Dim<0>).hash_code()) {
-      return vistor(d.var.Get<Dim<0>>());
-    } else if (d.var.TypeId() == typeid(Dim<1>).hash_code()) {
-      return vistor(d.var.Get<Dim<1>>());
-    } else if (d.var.TypeId() == typeid(Dim<2>).hash_code()) {
-      return vistor(d.var.Get<Dim<2>>());
-    } else if (d.var.TypeId() == typeid(Dim<3>).hash_code()) {
-      return vistor(d.var.Get<Dim<3>>());
-    } else if (d.var.TypeId() == typeid(Dim<4>).hash_code()) {
-      return vistor(d.var.Get<Dim<4>>());
-    } else if (d.var.TypeId() == typeid(Dim<5>).hash_code()) {
-      return vistor(d.var.Get<Dim<5>>());
-    } else if (d.var.TypeId() == typeid(Dim<6>).hash_code()) {
-      return vistor(d.var.Get<Dim<6>>());
-    } else if (d.var.TypeId() == typeid(Dim<7>).hash_code()) {
-      return vistor(d.var.Get<Dim<7>>());
-    } else if (d.var.TypeId() == typeid(Dim<8>).hash_code()) {
-      return vistor(d.var.Get<Dim<8>>());
-    } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
-      return vistor(d.var.Get<Dim<9>>());
-    } else {
-      printf(" dim not support  \n");
-      throw std::bad_exception();
-      //        return typename Vistor::type_t();
-    }
-  }
-
-  DDim() { var.Set<Dim<1>>(Dim<1>()); }
-
-  template <int D> explicit DDim(const Dim<D> &in) { var.Set<Dim<D>>(in); }
-
-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
-
-  template <int D> DDim &operator=(const Dim<D> &in) {
-    var.Set<Dim<D>>(in);
-    return *this;
-  }
-
-  int64_t &operator[](int idx);
-
-  int64_t operator[](int idx) const;
-
-  //  template <typename Visitor>
-  //  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-  //    return var.apply_visitor(visitor);
-  //  }
-  //
-  //  template <typename Visitor>
-  //  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
-  //    return var.apply_visitor(visitor);
-  //  }
-
-  DDimVar getVar() { return var; }
-
-  bool operator==(DDim d) const;
-
-  bool operator!=(DDim d) const;
-
-  DDim operator+(DDim d) const;
-
-  DDim operator*(DDim d) const;
-
-  int size() const;
-};
-
-/**
- * \brief Make a DDim from std::vector<int64_t>
- *
- * \param dims An vector of ints. Must be sized between [1, 9]
- */
-DDim make_ddim(const std::vector<int64_t> &dims);
-
-DDim make_ddim(const std::vector<int> &dims);
-
-/**
- * \brief Make a DDim from an initializer list
- *
- * \param dims An initializer list of ints. Must be sized between [1, 9]
- *
- */
-DDim make_ddim(std::initializer_list<int64_t> dims);
-
-int64_t get(const DDim &dim, int idx);
-
-void set(DDim &dim, int idx, int val);
-
-std::vector<int64_t> vectorize(const DDim &ddim);
-
-std::vector<int> vectorize2int(const DDim &ddim);
-
-int64_t product(const DDim &ddim);
-
-/**
- * \brief Slice a ddim
- *
- * Slice dim with [begin, end).
- * e.g.  DDim d = make_ddim({1,2,3,4,5});
- *       slice_ddim(d, 1, 3); ====> {2,3}
- */
-DDim slice_ddim(const DDim &dim, int begin, int end);
-
-/**
- * \brief What is the length of this dimension?
- *
- * \param Dynamic dimension to inspect
- */
+    namespace framework {
+
+        /**
+         * \brief A dynamically sized dimension.
+         *
+         * The number of dimensions must be between [1, 9].
+         */
+        struct DDim {
+            typedef Variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
+                            Dim<6>, Dim<7>, Dim<8>, Dim<9>>
+                DDimVar;
+            DDimVar var;
+
+            template <typename Vistor>
+            static typename Vistor::type_t ApplyVistor(Vistor vistor,
+                                                       const DDim &d) {
+                if (d.var.TypeId() == typeid(Dim<0>).hash_code()) {
+                    return vistor(d.var.Get<Dim<0>>());
+                } else if (d.var.TypeId() == typeid(Dim<1>).hash_code()) {
+                    return vistor(d.var.Get<Dim<1>>());
+                } else if (d.var.TypeId() == typeid(Dim<2>).hash_code()) {
+                    return vistor(d.var.Get<Dim<2>>());
+                } else if (d.var.TypeId() == typeid(Dim<3>).hash_code()) {
+                    return vistor(d.var.Get<Dim<3>>());
+                } else if (d.var.TypeId() == typeid(Dim<4>).hash_code()) {
+                    return vistor(d.var.Get<Dim<4>>());
+                } else if (d.var.TypeId() == typeid(Dim<5>).hash_code()) {
+                    return vistor(d.var.Get<Dim<5>>());
+                } else if (d.var.TypeId() == typeid(Dim<6>).hash_code()) {
+                    return vistor(d.var.Get<Dim<6>>());
+                } else if (d.var.TypeId() == typeid(Dim<7>).hash_code()) {
+                    return vistor(d.var.Get<Dim<7>>());
+                } else if (d.var.TypeId() == typeid(Dim<8>).hash_code()) {
+                    return vistor(d.var.Get<Dim<8>>());
+                } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
+                    return vistor(d.var.Get<Dim<9>>());
+                } else {
+                    printf(" dim not support  \n");
+                    throw std::bad_exception();
+                    //        return typename Vistor::type_t();
+                }
+            }
+
+            DDim() { var.Set<Dim<1>>(Dim<1>()); }
+
+            template <int D> explicit DDim(const Dim<D> &in) {
+                var.Set<Dim<D>>(in);
+            }
+
+            /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+
+            template <int D> DDim &operator=(const Dim<D> &in) {
+                var.Set<Dim<D>>(in);
+                return *this;
+            }
+
+            int64_t &operator[](int idx);
+
+            int64_t operator[](int idx) const;
+
+            //  template <typename Visitor>
+            //  typename Visitor::result_type apply_visitor(Visitor& visitor) {
+            //    return var.apply_visitor(visitor);
+            //  }
+            //
+            //  template <typename Visitor>
+            //  typename Visitor::result_type apply_visitor(Visitor& visitor)
+            //  const {
+            //    return var.apply_visitor(visitor);
+            //  }
+
+            DDimVar getVar() { return var; }
+
+            bool operator==(DDim d) const;
+
+            bool operator!=(DDim d) const;
+
+            DDim operator+(DDim d) const;
+
+            DDim operator*(DDim d) const;
+
+            int size() const;
+        };
+
+        /**
+         * \brief Make a DDim from std::vector<int64_t>
+         *
+         * \param dims A vector of ints. Must be sized between [1, 9]
+         */
+        DDim make_ddim(const std::vector<int64_t> &dims);
+
+        DDim make_ddim(const std::vector<int> &dims);
+
+        /**
+         * \brief Make a DDim from an initializer list
+         *
+         * \param dims An initializer list of ints. Must be sized between [1, 9]
+         *
+         */
+        DDim make_ddim(std::initializer_list<int64_t> dims);
+
+        int64_t get(const DDim &dim, int idx);
+
+        void set(DDim &dim, int idx, int val);
+
+        std::vector<int64_t> vectorize(const DDim &ddim);
+
+        std::vector<int> vectorize2int(const DDim &ddim);
+
+        int64_t product(const DDim &ddim);
+
+        /**
+         * \brief Slice a ddim
+         *
+         * Slice dim with [begin, end).
+         * e.g.  DDim d = make_ddim({1,2,3,4,5});
+         *       slice_ddim(d, 1, 3); ====> {2,3}
+         */
+        DDim slice_ddim(const DDim &dim, int begin, int end);
+
+        /**
+         * \brief What is the length of this dimension?
+         *
+         * \param ddim Dynamic dimension to inspect
+         */

-int arity(const DDim &ddim);
+        int arity(const DDim &ddim);

-std::ostream &operator<<(std::ostream &, const DDim &);
+        std::ostream &operator<<(std::ostream &, const DDim &);

-// Reshape a tensor to a matrix. The matrix's first dimension(column length)
-// will be the product of tensor's first `num_col_dims` dimensions.
-DDim flatten_to_2d(const DDim &src, int num_col_dims);
+        // Reshape a tensor to a matrix. The matrix's first dimension (column
+        // length) will be the product of the tensor's first `num_col_dims`
+        // dimensions.
+        DDim flatten_to_2d(const DDim &src, int num_col_dims);

-DDim flatten_to_1d(const DDim &src);
+        DDim flatten_to_1d(const DDim &src);

-DDim stride(const DDim &ddim);
+        DDim stride(const DDim &ddim);

-DDim stride_numel(const DDim &ddim);
-} // namespace framework
+        DDim stride_numel(const DDim &ddim);
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/dim.h
+++ b/src/framework/dim.h
@@ -21,388 +21,410 @@
 #include "platform/hostdevice.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-// Statically sized, statically indexed dimension
-template <int i> struct Dim {
-  static constexpr int dimensions = i;
+        // Statically sized, statically indexed dimension
+        template <int i> struct Dim {
+            static constexpr int dimensions = i;

-  template <typename... Args>
-  HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
-    static_assert(sizeof...(_tail) == i - 1,
-                  "Dim initialized with the wrong number of parameters");
-  }
+            template <typename... Args>
+            HOSTDEVICE Dim(int64_t _head, Args... _tail)
+                : head(_head), tail(_tail...) {
+                static_assert(
+                    sizeof...(_tail) == i - 1,
+                    "Dim initialized with the wrong number of parameters");
+            }

-  HOSTDEVICE
-  Dim(int64_t _head, const Dim<i - 1> &_tail) : head(_head), tail(_tail) {}
+            HOSTDEVICE
+            Dim(int64_t _head, const Dim<i - 1> &_tail)
+                : head(_head), tail(_tail) {}

-  HOSTDEVICE
-  Dim() : head(0), tail() {}
+            HOSTDEVICE
+            Dim() : head(0), tail() {}

-  /** Construct a Dim from a linear index and size.  Uses Fortran order
-   * indexing. */
-  HOSTDEVICE
-  Dim(int64_t idx, const Dim<i> &size)
-      : head(idx % size.head), tail(idx / size.head, size.tail) {}
+            /** Construct a Dim from a linear index and size.
+             * Uses Fortran-order indexing:
+             * head = idx % size.head; the tail is built from idx / size.head. */
+            HOSTDEVICE
+            Dim(int64_t idx, const Dim<i> &size)
+                : head(idx % size.head), tail(idx / size.head, size.tail) {}

-  /** Construct a Dim with each dimension set to the given index */
-  HOSTDEVICE
-  Dim(int64_t idx) : head(idx), tail(idx) {}
+            /** Construct a Dim with each dimension set to the given index */
+            HOSTDEVICE
+            Dim(int64_t idx) : head(idx), tail(idx) {}

-  HOSTDEVICE
-  bool operator==(const Dim<i> &o) const {
-    return (head == o.head) && (tail == o.tail);
-  }
+            HOSTDEVICE
+            bool operator==(const Dim<i> &o) const {
+                return (head == o.head) && (tail == o.tail);
+            }

-  HOSTDEVICE
-  bool operator!=(const Dim<i> &o) const { return !(*this == o); }
+            HOSTDEVICE
+            bool operator!=(const Dim<i> &o) const { return !(*this == o); }

-  HOSTDEVICE
-  int64_t &operator[](int idx);
-  HOSTDEVICE
-  int64_t operator[](int idx) const;
+            HOSTDEVICE
+            int64_t &operator[](int idx);
+            HOSTDEVICE
+            int64_t operator[](int idx) const;

-  HOST std::string to_string() const;
+            HOST std::string to_string() const;

-  int64_t head;
-  Dim<i - 1> tail;
-};
+            int64_t head;
+            Dim<i - 1> tail;
+        };

-// Base case specialization
-template <> struct Dim<0> {
-  static constexpr int dimensions = 0;
+        // Base case specialization
+        template <> struct Dim<0> {
+            static constexpr int dimensions = 0;

-  HOSTDEVICE
-  Dim(int64_t _head) {}
+            HOSTDEVICE
+            Dim(int64_t _head) {}

-  HOSTDEVICE
-  Dim() {}
+            HOSTDEVICE
+            Dim() {}

-  HOSTDEVICE
-  Dim(int idx, const Dim<0> &size) {
+            HOSTDEVICE
+            Dim(int idx, const Dim<0> &size) {
 #ifndef __CUDA_ARCH__
-    if (idx > 0) {
-      throw std::invalid_argument("Index out of range.");
-    }
+                if (idx > 0) {
+                    throw std::invalid_argument("Index out of range.");
+                }
 #else
-    PADDLE_ASSERT(idx == 0);
+                PADDLE_ASSERT(idx == 0);
 #endif
-  }
-
-  HOSTDEVICE
-  bool operator==(const Dim<0> &o) const { return true; }
-
-  HOSTDEVICE
-  bool operator!=(const Dim<0> &o) const { return false; }
-
-  HOSTDEVICE
-  int64_t &operator[](int idx);
-  HOSTDEVICE
-  int64_t operator[](int idx) const;
-};
-
-namespace {
-
-// Helper for accessing Dim classes
-template <int i> struct DimGetter {
-  // Return a copy if Dim is const
-  template <typename D> HOSTDEVICE static int64_t impl(const D &d) {
-    return DimGetter<i - 1>::impl(d.tail);
-  }
-  // Return a reference if Dim is mutable
-  template <typename D> HOSTDEVICE static int64_t &impl(D &d) {
-    return DimGetter<i - 1>::impl(d.tail);
-  }
-};
-
-// Eureka! We found the element!
-template <> struct DimGetter<0> {
-  // Return a copy if Dim is const
-  template <typename D> HOSTDEVICE static int64_t impl(const D &d) {
-    return d.head;
-  }
-  // Return a reference if Dim is mutable
-  template <typename D> HOSTDEVICE static int64_t &impl(D &d) { return d.head; }
-};
-
-template <int D> HOSTDEVICE int64_t &indexer(Dim<D> &dim, int idx) {
+            }
+
+            HOSTDEVICE
+            bool operator==(const Dim<0> &o) const { return true; }
+
+            HOSTDEVICE
+            bool operator!=(const Dim<0> &o) const { return false; }
+
+            HOSTDEVICE
+            int64_t &operator[](int idx);
+            HOSTDEVICE
+            int64_t operator[](int idx) const;
+        };
+
+        namespace {
+
+            // Steps `i` tail links down at compile time to reach element i
+            template <int i> struct DimGetter {
+                // Const overload: yields element i by value
+                template <typename D>
+                HOSTDEVICE static int64_t impl(const D &d) {
+                    return DimGetter<i - 1>::impl(d.tail);
+                }
+                // Mutable overload: yields a reference to element i
+                template <typename D> HOSTDEVICE static int64_t &impl(D &d) {
+                    return DimGetter<i - 1>::impl(d.tail);
+                }
+            };
+
+            // Recursion terminator: element 0 is the current `head`
+            template <> struct DimGetter<0> {
+                // Const overload: yields the head by value
+                template <typename D>
+                HOSTDEVICE static int64_t impl(const D &d) {
+                    return d.head;
+                }
+                // Mutable overload: yields a reference to the head
+                template <typename D> HOSTDEVICE static int64_t &impl(D &d) {
+                    return d.head;
+                }
+            };
+
+            template <int D> HOSTDEVICE int64_t &indexer(Dim<D> &dim, int idx) {
 #ifndef __CUDA_ARCH__
-  if (idx < 0) {
-    throw std::invalid_argument("Tried to access a negative dimension");
-  }
+                if (idx < 0) {
+                    throw std::invalid_argument(
+                        "Tried to access a negative dimension");
+                }
 #else
-  PADDLE_ASSERT(idx >= 0);
+                PADDLE_ASSERT(idx >= 0);
 #endif
-  if (idx == 0) {
-    return dim.head;
-  }
-  return indexer(dim.tail, idx - 1);
-}
+                if (idx == 0) {
+                    return dim.head;
+                }
+                return indexer(dim.tail, idx - 1);
+            }

-template <> HOSTDEVICE int64_t &indexer<0>(Dim<0> &dim, int idx) {
+            template <> HOSTDEVICE int64_t &indexer<0>(Dim<0> &dim, int idx) {
 #ifndef __CUDA_ARCH__
-  throw std::invalid_argument("Invalid index");
+                throw std::invalid_argument("Invalid index");
 #else
-  PADDLE_ASSERT(false);
+                PADDLE_ASSERT(false);
 #if CUDA_VERSION < 8000
-  // On CUDA versions previous to 8.0, only __shared__ variables
-  // could be declared as static in the device code.
-  int64_t head = 0;
+                // On CUDA versions previous to 8.0, only __shared__ variables
+                // could be declared as static in the device code.
+                int64_t head = 0;
 #else
-  static int64_t head = 0;
+                static int64_t head = 0;
 #endif
-  return head;
+                return head;
 #endif
-}
+            }

-template <int D> HOSTDEVICE int64_t indexer(const Dim<D> &dim, int idx) {
+            template <int D>
+            HOSTDEVICE int64_t indexer(const Dim<D> &dim, int idx) {
 #ifndef __CUDA_ARCH__
-  if (idx < 0) {
-    throw std::invalid_argument("Tried to access a negative dimension");
-  }
+                if (idx < 0) {
+                    throw std::invalid_argument(
+                        "Tried to access a negative dimension");
+                }
 #else
-  PADDLE_ASSERT(idx >= 0);
+                PADDLE_ASSERT(idx >= 0);
 #endif
-  if (idx == 0) {
-    return dim.head;
-  }
-  return indexer(dim.tail, idx - 1);
-}
-
-template <> HOSTDEVICE int64_t indexer<0>(const Dim<0> &dim, int idx) {
+                if (idx == 0) {
+                    return dim.head;
+                }
+                return indexer(dim.tail, idx - 1);
+            }
+
+            template <>
+            HOSTDEVICE int64_t indexer<0>(const Dim<0> &dim, int idx) {
 #ifndef __CUDA_ARCH__
-  throw std::invalid_argument("Invalid index");
+                throw std::invalid_argument("Invalid index");
 #else
-  PADDLE_ASSERT(false);
+                PADDLE_ASSERT(false);
 #if CUDA_VERSION < 8000
-  // On CUDA versions previous to 8.0, only __shared__ variables
-  // could be declared as static in the device code.
-  int64_t head = 0;
+                // On CUDA versions previous to 8.0, only __shared__ variables
+                // could be declared as static in the device code.
+                int64_t head = 0;
 #else
-  static int64_t head = 0;
+                static int64_t head = 0;
 #endif
-  return head;
+                return head;
 #endif
-}
-
-} // namespace
-// Static access to constant Dim
-template <int i, int l> HOSTDEVICE int64_t get(const Dim<l> &d) {
-  return DimGetter<i>::impl(d);
-}
-
-// Static access to mutable Dim
-template <int i, int l> HOSTDEVICE int64_t &get(Dim<l> &d) {
-  return DimGetter<i>::impl(d);
-}
-
-// Dynamic access to constant Dim
-template <int l> HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
-  //  std::cout << "l: " << l << std::endl;
-  return indexer(*this, i);
-}
-
-// Dynamic access to mutable Dim
-template <int l> HOSTDEVICE int64_t &Dim<l>::operator[](int i) {
-  return indexer(*this, i);
-}
-
-// Dynamic access to constant Dim
-inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
-  return indexer(*this, i);
-}
-
-// Dynamic access to mutable Dim
-inline HOSTDEVICE int64_t &Dim<0>::operator[](int i) {
-  return indexer(*this, i);
-}
-
-// Dynamic access to constant Dim
-// without std::enable_if will try to instantiate this on get<0>(d)
-template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l> &d,
-                                                               int i) {
-  return d[i];
-}
-
-// Dynamic access to mutable Dim
-template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int64_t &>::type get(Dim<l> &d,
-                                                                 int i) {
-  return d[i];
-}
-
-// Dot product of two dims
-template <int i>
-HOSTDEVICE int64_t linearize(const Dim<i> &a, const Dim<i> &b) {
-  return a.head * b.head + linearize(a.tail, b.tail);
-}
-
-// Base case dot product of two Dims
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) {
-  return 0;
-}
-
-// Product of a Dim
-template <int i> HOSTDEVICE int64_t product(const Dim<i> &a, int prod = 1) {
-  return prod * a.head * product(a.tail);
-}
-
-// Base case product of a Dim
-// Notice it is inline because it is no longer a template
-template <> HOSTDEVICE inline int64_t product(const Dim<0> &a, int prod) {
-  return prod;
-}
-
-// Is 0 <= idx_i < size_i for all i?
-template <int i>
-HOSTDEVICE bool contained(const Dim<i> &idx, const Dim<i> &size) {
-  return ((0 <= idx.head) && (idx.head < size.head) &&
-          contained(idx.tail, size.tail));
-}
-
-// Base case of is 0 <= idx_i < size_i ?
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline bool contained(const Dim<0> &idx, const Dim<0> &size) {
-  return true;
-}
-
-/**
- * \brief Compute exclusive prefix-multiply of a Dim.
- */
-template <int i>
-HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i> &src, int mul = 1) {
-  return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
-}
-
-///\cond HIDDEN
-// Base case of ex_prefix_mul
-// Notice it is inline because it is no longer a template
-template <> HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) {
-  return Dim<0>();
-}
-///\endcond
-
-/**
- * Add two dimensions together
- */
-template <int i> HOSTDEVICE Dim<i> dim_plus(const Dim<i> &a, const Dim<i> &b) {
-  return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail));
-}
-
-// Base case
-template <>
-HOSTDEVICE inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) {
-  return Dim<0>();
-}
-
-template <int i>
-HOSTDEVICE Dim<i> operator+(const Dim<i> &lhs, const Dim<i> &rhs) {
-  return dim_plus(lhs, rhs);
-}
-
-/**
- * Multiply two dimensions together
- */
-template <int i> HOSTDEVICE Dim<i> dim_mult(const Dim<i> &a, const Dim<i> &b) {
-  return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail));
-}
-
-// Base case
-template <>
-HOSTDEVICE inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) {
-  return Dim<0>();
-}
-
-template <int i>
-HOSTDEVICE Dim<i> operator*(const Dim<i> &lhs, const Dim<i> &rhs) {
-  return dim_mult(lhs, rhs);
-}
-
-/**
- * \brief Normalize strides to ensure any dimension with extent 1
- * has stride 0.
- *
- * \param size Dim object containing the size of an array
- * \param stride Dim object containing stride of an array
- * \return Dim object the same size as \p size with normalized strides
- *
- */
-
-template <int i>
-HOSTDEVICE Dim<i> normalize_strides(const Dim<i> &size, const Dim<i> &stride) {
-  int norm_stride = size.head == 1 ? 0 : stride.head;
-  return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail));
-}
-
-///\cond HIDDEN
-
-template <>
-HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size,
-                                           const Dim<0> &stride) {
-  return Dim<0>();
-}
-
-///\endcond
-
-/**
- * Helper function to create a Dim
- *
- * \param idxes The type of Dim constructed depends on the number of params
- *
- */
-
-template <typename... Args>
-HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
-  return Dim<sizeof...(Args)>(idxes...);
-}
-
-// Allows us to output a Dim
-// XXX For some reason, overloading fails to resolve this correctly
-template <int i>
-typename std::enable_if<(i > 1), std::ostream &>::type
-operator<<(std::ostream &os, const Dim<i> &d) {
-  os << d.head << ", " << d.tail;
-  return os;
-}
-
-// Base case that allows us to output a Dim
-// XXX I wish this could be an overload instead of a template
-template <int i>
-typename std::enable_if<(i == 1), std::ostream &>::type
-operator<<(std::ostream &os, const Dim<i> &d) {
-  os << d.head;
-  return os;
-}
-
-inline std::ostream &operator<<(std::ostream &os, const Dim<0> &d) {
-  return os;
-}
-
-template <int i> HOST std::string Dim<i>::to_string() const {
-  std::stringstream stream;
-
-  stream << *this;
-
-  return stream.str();
-}
-
-template <int D>
-HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
-  Dim<D> result;
-
-  for (int i = 0; i < D - 1; ++i) {
-    result[i] = linear_index % extents[i];
-    linear_index /= extents[i];
-  }
-
-  result[D - 1] = linear_index;
-
-  return result;
-}
-
-} // namespace framework
+            }
+
+        } // namespace
+        // Compile-time indexed read: element `i` of a const Dim, by value
+        template <int i, int l> HOSTDEVICE int64_t get(const Dim<l> &d) {
+            return DimGetter<i>::impl(d);
+        }
+
+        // Compile-time indexed access: reference to element `i` of a Dim
+        template <int i, int l> HOSTDEVICE int64_t &get(Dim<l> &d) {
+            return DimGetter<i>::impl(d);
+        }
+
+        // Runtime-indexed read of a const Dim; returns the element by value
+        template <int l> HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
+            // indexer() walks head/tail and rejects a negative index
+            return indexer(*this, i);
+        }
+
+        // Runtime-indexed access to a mutable Dim; returns a reference
+        template <int l> HOSTDEVICE int64_t &Dim<l>::operator[](int i) {
+            return indexer(*this, i);
+        }
+
+        // Dim<0> has no elements: indexer() treats every index as invalid
+        inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
+            return indexer(*this, i);
+        }
+
+        // Mutable counterpart; indexer() likewise treats any index as invalid
+        inline HOSTDEVICE int64_t &Dim<0>::operator[](int i) {
+            return indexer(*this, i);
+        }
+
+        // Runtime-indexed read of a const Dim (free-function form of d[i]).
+        // The enable_if stops get<0>(d) from instantiating this overload
+        template <int l>
+        HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type
+        get(const Dim<l> &d, int i) {
+            return d[i];
+        }
+
+        // Runtime-indexed access to a mutable Dim; returns a reference to d[i]
+        template <int l>
+        HOSTDEVICE typename std::enable_if<(l > 0), int64_t &>::type
+        get(Dim<l> &d, int i) {
+            return d[i];
+        }
+
+        // Dot product of two Dims: the sum over k of a[k] * b[k]
+        template <int i>
+        HOSTDEVICE int64_t linearize(const Dim<i> &a, const Dim<i> &b) {
+            return a.head * b.head + linearize(a.tail, b.tail);
+        }
+
+        // Base case: two empty Dims contribute 0 to the sum.
+        // Marked inline because a full specialization is an ordinary function
+        template <>
+        HOSTDEVICE inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) {
+            return 0;
+        }
+
+        // Product of all extents of a Dim, scaled by `prod`
+        template <int i>
+        HOSTDEVICE int64_t product(const Dim<i> &a, int prod = 1) {
+            return prod * a.head * product(a.tail);
+        }
+
+        // Base case: an empty Dim contributes only the `prod` factor.
+        // Marked inline because a full specialization is an ordinary function
+        template <>
+        HOSTDEVICE inline int64_t product(const Dim<0> &a, int prod) {
+            return prod;
+        }
+
+        // True when every component of idx lies in [0, size) for its axis
+        template <int i>
+        HOSTDEVICE bool contained(const Dim<i> &idx, const Dim<i> &size) {
+            const bool head_in_range = idx.head >= 0 && idx.head < size.head;
+            return head_in_range && contained(idx.tail, size.tail);
+        }
+
+        // Base case: with no components left there is nothing out of range.
+        // Marked inline because a full specialization is an ordinary function
+        template <>
+        HOSTDEVICE inline bool contained(const Dim<0> &idx,
+                                         const Dim<0> &size) {
+            return true;
+        }
+
+        /**
+         * \brief Exclusive prefix product: out[k] = mul * src[0] * .. * src[k-1]
+         */
+        template <int i>
+        HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i> &src, int64_t mul = 1) {
+            return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
+        }
+
+        ///\cond HIDDEN
+        // Base case; `mul` is int64_t so large running products do not narrow
+        template <>
+        HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0> &src,
+                                               int64_t mul) {
+            return Dim<0>();
+        }
+        ///\endcond
+
+        /**
+         * Element-wise sum of two Dims of the same rank
+         */
+        template <int i>
+        HOSTDEVICE Dim<i> dim_plus(const Dim<i> &a, const Dim<i> &b) {
+            return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail));
+        }
+
+        // Base case: the sum of two empty Dims is an empty Dim
+        template <>
+        HOSTDEVICE inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) {
+            return Dim<0>();
+        }
+
+        template <int i>
+        HOSTDEVICE Dim<i> operator+(const Dim<i> &lhs, const Dim<i> &rhs) {
+            return dim_plus(lhs, rhs); // operator form of dim_plus
+        }
+
+        /**
+         * Element-wise product of two Dims of the same rank
+         */
+        template <int i>
+        HOSTDEVICE Dim<i> dim_mult(const Dim<i> &a, const Dim<i> &b) {
+            return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail));
+        }
+
+        // Base case: the product of two empty Dims is an empty Dim
+        template <>
+        HOSTDEVICE inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) {
+            return Dim<0>();
+        }
+
+        template <int i>
+        HOSTDEVICE Dim<i> operator*(const Dim<i> &lhs, const Dim<i> &rhs) {
+            return dim_mult(lhs, rhs); // operator form of dim_mult
+        }
+
+        /**
+         * \brief Normalize strides to ensure any dimension with extent 1
+         * has stride 0.
+         *
+         * \param size Dim object containing the size of an array
+         * \param stride Dim object containing stride of an array
+         * \return Dim object the same size as \p size with normalized strides
+         * (each stride is kept as int64_t, so large strides are not truncated)
+         */
+
+        template <int i>
+        HOSTDEVICE Dim<i> normalize_strides(const Dim<i> &size,
+                                            const Dim<i> &stride) {
+            int64_t norm_stride = size.head == 1 ? 0 : stride.head;
+            return Dim<i>(norm_stride,
+                          normalize_strides(size.tail, stride.tail));
+        }
+
+        ///\cond HIDDEN
+        // Base case: empty size/stride Dims normalize to an empty Dim
+        template <>
+        HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size,
+                                                   const Dim<0> &stride) {
+            return Dim<0>();
+        }
+
+        ///\endcond
+
+        /**
+         * Helper function to create a Dim
+         *
+         * \param idxes One extent per dimension; the rank of the returned Dim
+         * is the number of arguments passed
+         *
+         */
+
+        template <typename... Args>
+        HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
+            return Dim<sizeof...(Args)>(idxes...);
+        }
+
+        // Streams a Dim of rank > 1 as "head, <rest>". The rank is selected
+        // with enable_if because plain overloading did not resolve correctly
+        template <int i>
+        typename std::enable_if<(i > 1), std::ostream &>::type
+        operator<<(std::ostream &os, const Dim<i> &d) {
+            // The tail is streamed by this template or by the rank-1 one
+            return os << d.head << ", " << d.tail;
+        }
+
+        // Rank-1 case: streams the single extent with no separator.
+        // A template (not a plain overload) to pair with the rank > 1 case
+        template <int i>
+        typename std::enable_if<(i == 1), std::ostream &>::type
+        operator<<(std::ostream &os, const Dim<i> &d) {
+            // Only the head is printed; the Dim<0> tail is not streamed
+            return os << d.head;
+        }
+
+        inline std::ostream &operator<<(std::ostream &os, const Dim<0> &d) {
+            return os; // a Dim<0> has nothing to print
+        }
+
+        // Text form of the Dim, produced by streaming it through operator<<
+        // into a string buffer
+        template <int i> HOST std::string Dim<i>::to_string() const {
+            std::ostringstream rendered;
+            rendered << *this;
+            return rendered.str();
+        }
+
+        // Splits a linear index into per-axis indices, first axis fastest.
+        // The index is int64_t (as in Dim's own ctor) so it cannot truncate.
+        template <int D>
+        HOSTDEVICE Dim<D> linear_to_dimension(int64_t linear_index,
+                                              Dim<D> extents) {
+            Dim<D> result;
+            for (int i = 0; i < D - 1; ++i) {
+                result[i] = linear_index % extents[i];
+                linear_index /= extents[i];
+            }
+            // Whatever remains belongs to the last axis
+            result[D - 1] = linear_index;
+            return result;
+        }
+
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -23,72 +23,75 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-template <typename Dtype>
-Executor<Dtype>::Executor(const Program<Dtype> p) : program_(p) {
-  if (use_optimize_) {
-    to_predict_program_ = program_.optimizeProgram;
-  } else {
-    to_predict_program_ = program_.originProgram;
-  }
+        template <typename Dtype>
+        Executor<Dtype>::Executor(const Program<Dtype> p) : program_(p) {
+            if (use_optimize_) {
+                to_predict_program_ = program_.optimizeProgram;
+            } else {
+                to_predict_program_ = program_.originProgram;
+            }

-  const std::vector<std::shared_ptr<BlockDesc>> blocks =
-      to_predict_program_->Blocks();
-  for (int i = 0; i < blocks.size(); ++i) {
-    std::shared_ptr<BlockDesc> block_desc = blocks[i];
-    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-    for (int j = 0; j < ops.size(); ++j) {
-      std::shared_ptr<OpDesc> op = ops[j];
-      if (op->Type() == "conv2d" && op->Input("Input")[0] == "pixel") {
-        Attribute strides_attr = op->GetAttrMap().at("strides");
-        std::vector<int> stride = strides_attr.Get<std::vector<int>>();
-        for (int k = 0; k < stride.size(); ++k) {
+            // Register a ConvOp for each conv2d op that reads the "pixel"
+            // input; every other op in the program is currently skipped.
+            const std::vector<std::shared_ptr<BlockDesc>> blocks =
+                to_predict_program_->Blocks();
+            for (const auto &block_desc : blocks) {
+                const auto &ops = block_desc->Ops();
+                for (const auto &op : ops) {
+                    if (!(op->Type() == "conv2d" &&
+                          op->Input("Input")[0] == "pixel")) {
+                        continue;
+                    }
+                    // Lookup kept as before so a bad "strides" attribute is
+                    // still noticed here; the values themselves are unused.
+                    Attribute strides_attr = op->GetAttrMap().at("strides");
+                    strides_attr.Get<std::vector<int>>();
+                    auto conv =
+                        std::make_shared<operators::ConvOp<Dtype, float>>(
+                            op->Type(), op->GetInputs(), op->GetOutputs(),
+                            op->GetAttrMap(), program_.scope);
+                    ops_of_block_[*block_desc].push_back(conv);
+                }
+            }
        }

-        std::shared_ptr<operators::ConvOp<Dtype, float>> conv =
-            std::make_shared<operators::ConvOp<Dtype, float>>(
-                op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
-                program_.scope);
-        ops_of_block_[*block_desc.get()].push_back(conv);
-      }
-    }
-  }
-}
+        template <typename Dtype>
+        std::shared_ptr<Tensor> Executor<Dtype>::predict(Tensor &t) {
+            // feed
+            auto scope = program_.scope;
+            Variable *g_feed_value = scope->Var("pixel");
+            auto tensor = g_feed_value->GetMutable<Tensor>();
+            tensor->ShareDataWith(t);

-template <typename Dtype>
-std::shared_ptr<Tensor> Executor<Dtype>::predict(Tensor &t) {
-  // feed
-  auto scope = program_.scope;
-  Variable *g_feed_value = scope->Var("pixel");
-  auto tensor = g_feed_value->GetMutable<Tensor>();
-  tensor->ShareDataWith(t);
+            Variable *con_output = scope->Var("conv2d_0.tmp_0");
+            Tensor *output_tensor = con_output->GetMutable<Tensor>();
+            output_tensor->mutable_data<float>({1, 16, 32, 32});
+            //  std::cout << typeid(output_tensor).name() << std::endl;
+            //  std::cout << "output_tensor dims: " << output_tensor->dims() <<
+            //  std::endl;

-  Variable *con_output = scope->Var("conv2d_0.tmp_0");
-  Tensor *output_tensor = con_output->GetMutable<Tensor>();
-  output_tensor->mutable_data<float>({1, 16, 32, 32});
-  //  std::cout << typeid(output_tensor).name() << std::endl;
-  //  std::cout << "output_tensor dims: " << output_tensor->dims() << std::endl;
+            // No-op deleter: the scope Variable is assumed to own this tensor.
+            std::shared_ptr<Tensor> out_tensor(output_tensor, [](Tensor *) {});

-  std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
-  out_tensor.reset(output_tensor);
-
-  predict(t, 0);
-  return out_tensor;
-}
+            predict(t, 0);
+            return out_tensor;
+        }

-template <typename Dtype>
-void Executor<Dtype>::predict(const Tensor &t, int block_id) {
-  std::shared_ptr<BlockDesc> to_predict_block =
-      to_predict_program_->Block(block_id);
-  for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
-    auto op = ops_of_block_[*to_predict_block.get()][j];
-    //    std::cout << "开始run" << std::endl;
-    op->Run();
-  }
-}
+        // Runs every operator registered for block `block_id`, in order.
+        template <typename Dtype>
+        void Executor<Dtype>::predict(const Tensor &t, int block_id) {
+            std::shared_ptr<BlockDesc> to_predict_block =
+                to_predict_program_->Block(block_id);
+            // One map lookup instead of two per iteration; `t` is not read.
+            auto &block_ops = ops_of_block_[*to_predict_block];
+            for (const auto &op : block_ops) {
+                op->Run();
+            }
+        }

-template class Executor<CPU>;
+        template class Executor<CPU>;

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -32,22 +32,22 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-template <typename Dtype> class Executor {
-public:
-  Executor(const Program<Dtype> p);
-  std::shared_ptr<Tensor> predict(Tensor &t);
+        template <typename Dtype> class Executor {
+          public:
+            Executor(const Program<Dtype> p);
+            std::shared_ptr<Tensor> predict(Tensor &t);

-private:
-  const framework::Program<Dtype> program_;
-  std::shared_ptr<ProgramDesc> to_predict_program_;
-  void predict(const Tensor &t, int block_id);
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-      ops_of_block_;
-  bool use_optimize_ = false;
-};
+          private:
+            const framework::Program<Dtype> program_;
+            std::shared_ptr<ProgramDesc> to_predict_program_;
+            void predict(const Tensor &t, int block_id);
+            std::map<framework::BlockDesc,
+                     std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+                ops_of_block_;
+            bool use_optimize_ = false;
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/framework.pb.cpp
+++ b/src/framework/framework.pb.cpp
--- a/src/framework/framework.pb.h
+++ b/src/framework/framework.pb.h
--- a/src/framework/lod_tensor.cc
+++ b/src/framework/lod_tensor.cc
@@ -19,280 +19,304 @@ limitations under the License. */
 #include <string.h>

 namespace paddle_mobile {
-namespace framework {
-
-std::ostream &operator<<(std::ostream &os, const LoD &lod) {
-  os << "{";
-  for (auto &v : lod) {
-    os << "{";
-    bool is_first = true;
-    for (auto &i : v) {
-      if (is_first) {
-        os << i;
-        is_first = false;
-      } else {
-        os << ", " << i;
-      }
-    }
-    os << "}";
-  }
-  os << "}";
-
-  return os;
-}
-
-std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
-  //  PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
-
-  //  if (!platform::is_cpu_place(t.place())) {
-  //    LoDTensor tt;
-  //    framework::TensorCopy(t, platform::CPUPlace(), &tt);
-  //    platform::DeviceContextPool &pool =
-  //    platform::DeviceContextPool::Instance(); auto &dev_ctx =
-  //    *pool.Get(t.place()); dev_ctx.Wait();
-  //
-  //    os << tt;
-  //    return os;
-  //  }
-
-  os << "dim: " << t.dims() << "\n";
-  os << "lod: " << t.lod() << "\n";
-
-  // only print first ten elements
-  int64_t size = t.numel() < 10 ? t.numel() : 10;
-  for (int64_t i = 0; i < size; ++i) {
-    os << t.data<float>()[i] << " ";
-  }
-
-  return os;
-}
-
-std::string LoDToString(const LoD &lod) {
-  std::ostringstream stream;
-  stream << lod;
-  return stream.str();
-}
-
-LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
-                 size_t elem_end) {
-  //  PADDLE_ENFORCE_LT(level, in.size());
-  //  PADDLE_ENFORCE_LT(elem_end, in[level].size());
-
-  LoD res;
-  res.resize(in.size() - level);
-  // copy the first level
-  res[0].assign(in[level].begin() + elem_begin,
-                in[level].begin() + elem_end + 1);
-  for (size_t lvl = 1; lvl < res.size(); lvl++) {
-    const auto &in_level = in[level + lvl];
-    const auto &above_level = res[lvl - 1];
-    auto &out_level = res[lvl];
-    out_level.assign(in_level.begin() + above_level.front(),
-                     in_level.begin() + above_level.back() + 1);
-  }
-  for (size_t lvl = 0; lvl < res.size(); lvl++) {
-    // to make the first offset equals 0, all the elements minus the first
-    // element
-    size_t front = res[lvl].front();
-    for (auto &ele : res[lvl]) {
-      ele -= front;
-    }
-  }
-  return res;
-}
-
-LoD ToAbsOffset(const LoD &in) {
-  // the lowest level stores relative offsets
-  if (in.empty() || in.size() == 1)
-    return in;
-  LoD result = in;
-  for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
-    for (size_t i = 0; i < in[level].size(); ++i) {
-      size_t index = in[level][i];
-      result[level][i] = result[level + 1][index];
-    }
-  }
-  return result;
-}
-
-bool operator==(const LoD &a, const LoD &b) {
-  if (a.size() != b.size()) {
-    return false;
-  }
-
-  for (size_t i = 0; i < a.size(); i++) {
-    const auto &a_level = a[i];
-    const auto &b_level = b[i];
-    if (a_level.size() != b_level.size()) {
-      return false;
-    }
-    for (size_t j = 0; j < a_level.size(); j++) {
-      if (a_level[j] != b_level[j]) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-bool CheckLoD(const LoD &in, int tensor_height) {
-  if (in.empty())
-    return true;
-  for (const auto &level : in) {
-    // check: there should be more than 2 offsets existing in each level.
-    if (level.size() < 2)
-      return false;
-    // check: the first offset(the begin offset) of each level should be 0.
-    if (level.front() != 0)
-      return false;
-    // check: all the offsets in a level should be ascending(no same items
-    // allows).
-    if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
-          if (a < b)
+    namespace framework {
+
+        /*
+         * Writes `lod` to `os` as nested brace groups, for example
+         * `{{0, 2}{0, 3, 5}}`. Offsets inside one level are separated by
+         * ", "; the levels themselves are written back to back without a
+         * separator. Returns `os` so calls can be chained.
+         */
+        std::ostream &operator<<(std::ostream &os, const LoD &lod) {
+            os << "{";
+            for (const auto &level : lod) {
+                os << "{";
+                for (size_t idx = 0; idx < level.size(); ++idx) {
+                    // Every offset except the first is preceded by ", ".
+                    if (idx != 0) {
+                        os << ", ";
+                    }
+                    os << level[idx];
+                }
+                os << "}";
+            }
+            os << "}";
+
+            return os;
+        }
+
+        /*
+         * Writes a short human-readable dump of `t` to `os`: a "dim: " line,
+         * a "lod: " line and then at most the first ten elements, each
+         * followed by a single space. Returns `os` so calls can be chained.
+         *
+         * The elements are read through data<float>(); neither the element
+         * type nor the placement of the tensor is checked here (the checks
+         * carried over from the original Paddle code are disabled).
+         */
+        std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
+            os << "dim: " << t.dims() << "\n";
+            os << "lod: " << t.lod() << "\n";
+
+            // Never dump more than ten elements.
+            const int64_t max_shown = 10;
+            const int64_t shown =
+                t.numel() < max_shown ? t.numel() : max_shown;
+            for (int64_t idx = 0; idx < shown; ++idx) {
+                os << t.data<float>()[idx] << " ";
+            }
+
+            return os;
+        }
+
+        /* Returns the text that operator<< produces for `lod`. */
+        std::string LoDToString(const LoD &lod) {
+            std::ostringstream buffer;
+            buffer << lod;
+            return buffer.str();
+        }
+
+        /*
+         * Cuts the offsets elem_begin..elem_end (both inclusive) out of
+         * in[level], together with the ranges of all lower levels that those
+         * offsets refer to. Every level of the result is rebased so that its
+         * first offset is 0. The result has in.size() - level levels.
+         *
+         * Expected but not checked: level < in.size() and
+         * elem_end < in[level].size().
+         */
+        LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
+                         size_t elem_end) {
+            LoD sliced(in.size() - level);
+
+            // The requested level is copied directly.
+            const auto &top = in[level];
+            sliced[0].assign(top.begin() + elem_begin,
+                             top.begin() + elem_end + 1);
+
+            // Each lower level keeps the range spanned by the level above.
+            for (size_t depth = 1; depth < sliced.size(); ++depth) {
+                const auto &source = in[level + depth];
+                const size_t first = sliced[depth - 1].front();
+                const size_t last = sliced[depth - 1].back();
+                sliced[depth].assign(source.begin() + first,
+                                     source.begin() + last + 1);
+            }
+
+            // Shift every level so that it starts at offset 0 again.
+            for (auto &offsets : sliced) {
+                const size_t base = offsets.front();
+                for (auto &offset : offsets) {
+                    offset -= base;
+                }
+            }
+            return sliced;
+        }
+
+        /*
+         * Converts a LoD with relative offsets into one with absolute
+         * offsets. The lowest level is kept as is; every level above it is
+         * rewritten, bottom-up, by replacing each offset with the entry it
+         * indexes in the already converted level below. A LoD with fewer
+         * than two levels is returned unchanged.
+         */
+        LoD ToAbsOffset(const LoD &in) {
+            if (in.size() <= 1) {
+                return in;
+            }
+            LoD absolute = in;
+            // Walk from the second-lowest level up to level 0.
+            for (size_t upper = in.size() - 1; upper-- > 0;) {
+                const auto &relative = in[upper];
+                for (size_t pos = 0; pos < relative.size(); ++pos) {
+                    absolute[upper][pos] = absolute[upper + 1][relative[pos]];
+                }
+            }
+            return absolute;
+        }
+
+        /*
+         * Two LoDs are equal when they have the same number of levels and
+         * every pair of corresponding levels holds the same offsets.
+         */
+        bool operator==(const LoD &a, const LoD &b) {
+            if (a.size() != b.size()) {
+                return false;
+            }
+
+            for (size_t lvl = 0; lvl < a.size(); ++lvl) {
+                const auto &lhs = a[lvl];
+                const auto &rhs = b[lvl];
+                // std::equal needs ranges of equal length, so the sizes are
+                // compared first.
+                if (lhs.size() != rhs.size() ||
+                    !std::equal(lhs.begin(), lhs.end(), rhs.begin())) {
+                    return false;
+                }
+            }
            return true;
-          return false;
-        })) {
-      std::cout << "ascending error";
-      return false;
-    }
-  }
-  // check: the lowest level's last offset should equals `tensor_height` if
-  //        tensor_height>0.
-  if (tensor_height > 0 && (size_t)tensor_height != in.back().back())
-    return false;
-
-  // check: the higher level's last offset should equals the lower level's
-  // size-1.
-  // NOTE LoD store the levels from top to bottom, so the higher level goes
-  // first.
-  for (size_t level = 0; level < in.size() - 1; level++) {
-    if (in[level].back() != in[level + 1].size() - 1)
-      return false;
-  }
-  return true;
-}
-
-bool CheckAbsLoD(const LoD &in, int tensor_height) {
-  if (in.empty())
-    return true;
-  for (const auto &level : in) {
-    // check: all the offsets in a level should be ascending(no same items
-    // allows).
-    if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
-          if (a < b)
+        }
+
+        /*
+         * Validates a LoD that stores relative offsets. Returns true for an
+         * empty LoD and for a LoD that passes every check below, false
+         * otherwise.
+         *
+         * in             the LoD to validate
+         * tensor_height  expected last offset of the lowest level; the check
+         *                is skipped unless tensor_height > 0
+         */
+        bool CheckLoD(const LoD &in, int tensor_height) {
+            if (in.empty())
+                return true;
+            for (const auto &level : in) {
+                // check: each level needs at least 2 offsets.
+                if (level.size() < 2)
+                    return false;
+                // check: the first offset (the begin offset) of each level
+                // should be 0.
+                if (level.front() != 0)
+                    return false;
+                // check: the offsets of a level must not decrease. The whole
+                // level [begin, end) has to be scanned; the former range
+                // (begin, begin) was empty, which made this check a no-op.
+                // NOTE(review): equal neighbouring offsets still pass here,
+                // although the header comment asks for "no same items".
+                // Rejecting them would refuse LoDs with empty sequences, so
+                // that decision is left open.
+                if (!std::is_sorted(level.begin(), level.end())) {
+                    std::cout << "ascending error";
+                    return false;
+                }
+            }
+            // check: the lowest level's last offset should equal
+            // `tensor_height` if tensor_height > 0.
+            if (tensor_height > 0 &&
+                static_cast<size_t>(tensor_height) != in.back().back())
+                return false;
+
+            // check: a higher level's last offset should equal the next
+            // lower level's size - 1.
+            // NOTE: a LoD stores its levels from top to bottom, so the higher
+            // level comes first.
+            for (size_t level = 0; level < in.size() - 1; level++) {
+                if (in[level].back() != in[level + 1].size() - 1)
+                    return false;
+            }
            return true;
-          return false;
-        })) {
-      return false;
-    }
-
-    // check: there should be more than 2 offsets existing in each level.
-    if (level.size() < 2)
-      return false;
-
-    // check: the first offset of each level should be 0, and the last should be
-    // the same(the height of underlying tensor).
-    if (level.front() != 0)
-      return false;
-    if (tensor_height < 0) {
-      tensor_height = level.back();
-    } else if ((size_t)tensor_height != level.back()) {
-      return false;
-    }
-  }
-  return true;
-}
-
-using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
-
-LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
-                                        size_t end_idx, size_t start_level) {
-  LoD sub_lod;
-
-  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
-    //    PADDLE_ENFORCE_LE(start_idx, end_idx);
-    //    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
-    std::vector<size_t> level_lens;
-    for (size_t i = start_idx; i < end_idx; ++i) {
-      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
-    }
-    sub_lod.emplace_back(level_lens);
-    start_idx = lod[level_idx][start_idx];
-    end_idx = lod[level_idx][end_idx];
-  }
-
-  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
-}
-
-void AppendLoD(LoD *lod, const LoD &lod_length) {
-  //  PADDLE_ENFORCE(
-  //      lod->empty() || lod->size() == lod_length.size(),
-  //      "The lod_length should has the same size with the appended lod.");
-  if (lod->empty()) {
-    for (size_t i = 0; i < lod_length.size(); ++i) {
-      lod->emplace_back(1, 0); // size = 1, value = 0;
-    }
-    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
-  }
-  for (size_t i = 0; i < lod->size(); ++i) {
-    auto &level = (*lod)[i];
-    for (size_t len : lod_length[i]) {
-      level.push_back(level.back() + len);
-    }
-  }
-}
-
-void SerializeToStream(std::ostream &os, const LoDTensor &tensor) {
-  { // the 1st field, uint32_t version for LoDTensor
-    constexpr uint32_t version = 0;
-    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
-  }
-  {
-    // the 2st field, LoD information
-    // uint64_t lod_level
-    // uint64_t lod_level_1 size in byte.
-    // int*     lod_level_1 data
-    // ...
-    auto lod = tensor.lod();
-    uint64_t size = lod.size();
-    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-
-    for (auto &each : lod) {
-      size = each.size() * sizeof(framework::LoD::value_type::value_type);
-      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-      os.write(reinterpret_cast<const char *>(each.data()),
-               static_cast<std::streamsize>(size));
-    }
-  }
-  // the 3st field, Tensor
-  TensorToStream(os, static_cast<Tensor>(tensor));
-}
-
-void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
-  {
-    // the 1st field, unit32_t version for LoDTensor
-    uint32_t version;
-    is.read(reinterpret_cast<char *>(&version), sizeof(version));
-    //    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  }
-  {
-    // the 2st field, LoD information
-    uint64_t lod_level;
-    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-    auto &lod = *tensor->mutable_lod();
-    lod.resize(lod_level);
-    for (uint64_t i = 0; i < lod_level; ++i) {
-      uint64_t size;
-      is.read(reinterpret_cast<char *>(&size), sizeof(size));
-      std::vector<size_t> tmp(size / sizeof(size_t));
-      is.read(reinterpret_cast<char *>(tmp.data()),
-              static_cast<std::streamsize>(size));
-      lod[i] = tmp;
-    }
-  }
-  // the 3st filed, Tensor
-  TensorFromStream(is, static_cast<Tensor *>(tensor));
-}
-
-} // namespace framework
+        }
+
+        /*
+         * Validates a LoD that stores absolute offsets. Returns true for an
+         * empty LoD and for a LoD in which every level passes the checks
+         * below, false otherwise.
+         *
+         * in             the LoD to validate
+         * tensor_height  expected last offset of every level; when negative,
+         *                the last offset of the first level is used instead
+         */
+        bool CheckAbsLoD(const LoD &in, int tensor_height) {
+            if (in.empty())
+                return true;
+            for (const auto &level : in) {
+                // check: the offsets of a level must not decrease. The whole
+                // level [begin, end) has to be scanned; the former range
+                // (begin, begin) was empty, which made this check a no-op.
+                // NOTE(review): equal neighbouring offsets still pass here,
+                // although the header comment asks for "no same items".
+                if (!std::is_sorted(level.begin(), level.end())) {
+                    return false;
+                }
+
+                // check: each level needs at least 2 offsets.
+                if (level.size() < 2)
+                    return false;
+
+                // check: the first offset of each level should be 0, and the
+                // last one should be the same for every level (the height of
+                // the underlying tensor).
+                if (level.front() != 0)
+                    return false;
+                if (tensor_height < 0) {
+                    // No height given: adopt the one of the first level.
+                    tensor_height = static_cast<int>(level.back());
+                } else if (static_cast<size_t>(tensor_height) !=
+                           level.back()) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
+
+        /*
+         * For the elements [start_idx, end_idx) of level `start_level`,
+         * collects the sequence lengths on that level and on every level
+         * below it, and follows the offsets downwards to find the range they
+         * cover on the lowest level.
+         *
+         * Returns the per-level lengths together with the final
+         * {start, end} offset pair.
+         *
+         * Expected but not checked on every visited level:
+         * start_idx <= end_idx and end_idx < lod[level].size().
+         */
+        LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod,
+                                                size_t start_idx,
+                                                size_t end_idx,
+                                                size_t start_level) {
+            LoD sub_lod;
+
+            size_t level_idx = start_level;
+            while (level_idx < lod.size()) {
+                const auto &offsets = lod[level_idx];
+
+                // Length of every sequence inside the current range.
+                std::vector<size_t> lengths;
+                for (size_t pos = start_idx; pos < end_idx; ++pos) {
+                    lengths.push_back(offsets[pos + 1] - offsets[pos]);
+                }
+                sub_lod.emplace_back(lengths);
+
+                // The offsets of this level are the indices of the next one.
+                start_idx = offsets[start_idx];
+                end_idx = offsets[end_idx];
+                ++level_idx;
+            }
+
+            return LoDAndOffset{sub_lod, {start_idx, end_idx}};
+        }
+
+        /*
+         * Appends sequences to `lod`. `lod_length` holds, per level, the
+         * lengths of the sequences to add; each length is turned into an
+         * offset by adding it to the current last offset of that level.
+         * An empty `lod` is first given one level per entry of `lod_length`,
+         * each starting with the offset 0.
+         *
+         * Expected but not checked: `lod` is empty or has as many levels as
+         * `lod_length`.
+         */
+        void AppendLoD(LoD *lod, const LoD &lod_length) {
+            if (lod->empty()) {
+                // One level per entry of lod_length, each holding just the
+                // leading offset 0. (The element-wise emplace_back loop that
+                // used to precede this assignment was overwritten by it and
+                // has been dropped.)
+                *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
+            }
+            for (size_t i = 0; i < lod->size(); ++i) {
+                auto &level = (*lod)[i];
+                for (size_t len : lod_length[i]) {
+                    level.push_back(level.back() + len);
+                }
+            }
+        }
+
+        /*
+         * Writes `tensor` to `os` in three consecutive fields:
+         *   1. uint32_t version (always 0)
+         *   2. the LoD: uint64_t number of levels, then for every level its
+         *      size in bytes (uint64_t) followed by the raw offsets
+         *   3. the tensor itself, written by TensorToStream
+         * All values are written in the in-memory representation of the
+         * running platform.
+         */
+        void SerializeToStream(std::ostream &os, const LoDTensor &tensor) {
+            { // the 1st field, uint32_t version for LoDTensor
+                constexpr uint32_t version = 0;
+                os.write(reinterpret_cast<const char *>(&version),
+                         sizeof(version));
+            }
+            {
+                // the 2nd field, LoD information
+                // uint64_t lod_level
+                // uint64_t lod_level_1 size in byte.
+                // size_t*  lod_level_1 data
+                // ...
+                // Bound by reference: lod() returns a const reference, so
+                // there is no need to copy the whole LoD.
+                const auto &lod = tensor.lod();
+                uint64_t size = lod.size();
+                os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+                for (const auto &each : lod) {
+                    size = each.size() *
+                           sizeof(framework::LoD::value_type::value_type);
+                    os.write(reinterpret_cast<const char *>(&size),
+                             sizeof(size));
+                    os.write(reinterpret_cast<const char *>(each.data()),
+                             static_cast<std::streamsize>(size));
+                }
+            }
+            // the 3rd field, Tensor
+            TensorToStream(os, static_cast<Tensor>(tensor));
+        }
+
+        /*
+         * Reads a LoDTensor from `is` into `*tensor`, expecting the layout
+         * written by SerializeToStream: a uint32_t version, the LoD (number
+         * of levels, then per level a byte count followed by the raw
+         * offsets) and finally the tensor, which is read by
+         * TensorFromStream.
+         *
+         * The version value is read but not validated.
+         */
+        void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+            {
+                // the 1st field, uint32_t version for LoDTensor
+                // Zero-initialised so that a failed read leaves a defined
+                // value behind.
+                uint32_t version = 0;
+                is.read(reinterpret_cast<char *>(&version), sizeof(version));
+                //    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is
+                //    supported");
+            }
+            {
+                // the 2nd field, LoD information
+                uint64_t lod_level = 0;
+                is.read(reinterpret_cast<char *>(&lod_level),
+                        sizeof(lod_level));
+                auto &lod = *tensor->mutable_lod();
+                lod.resize(lod_level);
+                for (uint64_t i = 0; i < lod_level; ++i) {
+                    uint64_t size = 0;
+                    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+                    // `size` is a byte count taken from the stream. Round
+                    // the element count up so that the buffer always holds
+                    // at least `size` bytes; truncating division left the
+                    // buffer too small whenever `size` was not a multiple of
+                    // sizeof(size_t), and the read below then wrote past its
+                    // end.
+                    const uint64_t count =
+                        size / sizeof(size_t) +
+                        (size % sizeof(size_t) != 0 ? 1 : 0);
+                    std::vector<size_t> tmp(count);
+                    is.read(reinterpret_cast<char *>(tmp.data()),
+                            static_cast<std::streamsize>(size));
+                    lod[i] = tmp;
+                }
+            }
+            // the 3rd field, Tensor
+            TensorFromStream(is, static_cast<Tensor *>(tensor));
+        }
+
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/lod_tensor.h
+++ b/src/framework/lod_tensor.h
@@ -23,178 +23,190 @@ limitations under the License. */

 namespace paddle_mobile {

-namespace framework {
-
-/*
- * LoD is short for Level of Details.
- *
- * - in a level, each element indicates relative offset of the lower level
- * - the first element should be 0 and that indicates that this sequence start
- * from 0
- * - each sequence's begin and end(no-inclusive) is level[id, id+1]
- *
- * For example:
- *    3-level LoD stores
- *
- *    0 2 3
- *    0 2 4 7
- *    0 2 5 7 10 12 15 20
- */
-using LoD = std::vector<std::vector<size_t>>;
-
-std::ostream &operator<<(std::ostream &os, const LoD &lod);
-
-std::ostream &operator<<(std::ostream &os, const LoDTensor &t);
-
-std::string LoDToString(const LoD &lod);
-
-LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
-                 size_t elem_end);
-
-/*
- * Transform an LoD from relative offsets to absolute offsets.
- */
-LoD ToAbsOffset(const LoD &in);
-
-bool operator==(const LoD &a, const LoD &b);
-
-/*
- * Check whether this lod's format is valid.
- *
- * ATTENTION:
- *   - Empty lod is treated as valid.
- *
- * It will check two things:
- *
- *  1. all the offsets in a level should be ascending(no same items allows).
- *  2. there should be more than 2 offsets existing in each level.
- *  3. the higher level's last offset should equals the lower level's size-1.
- *  4. the first offset(the begin offset) of each level should be 0.
- *  5. the lowest level's last offset should equals `tensor_height` if
- * tensor_height>0.
- */
-
-bool CheckLoD(const LoD &in, int tensor_height = -1);
-
-/*
- * Check whether this absolute lod's format is valid.
- *
- * ATTENTION:
- *   - Empty lod is treated as valid.
- *
- * It will check two things:
- *  1. all the offsets in a level should be ascending(no same items allows)
- *  2. there should be more than 2 offsets existing in each level.
- *  3. the first offset of each level should be 0, and the last should be the
- *     same(the height of underlying tensor) or `tensor_height` if
- *     tensor_height>0.
- */
-bool CheckAbsLoD(const LoD &in, int tensor_height = -1);
-
-/*
- * LoDTensor (Level of details Tensor)
- * see https://en.wikipedia.org/wiki/Level_of_details for reference.
- */
-class LoDTensor : public Tensor {
-public:
-  LoDTensor() : Tensor() {}
-
-  explicit LoDTensor(const LoD &lod) : lod_(lod) {}
-
-  void set_lod(const LoD &lod) { lod_ = lod; }
-
-  const LoD &lod() const { return lod_; }
-
-  LoD *mutable_lod() { return &lod_; }
-
-  /*
-   * Get the start offset and end offset of an  element from LoD.
-   */
-  std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const {
-    //    PADDLE_ENFORCE_LT(level, NumLevels());
-    //    PADDLE_ENFORCE_LT(elem, NumElements(level));
-    return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]);
-  }
-
-  /*
-   * Number of LoDTensor's levels, each level has units of data, for example,
-   * in the sentence's view, article, paragraph, sentence are 3 levels.
-   */
-  size_t NumLevels() const { return lod_.size(); }
-
-  /*
-   * Number of elements in a level.
-   */
-  size_t NumElements(size_t level = 0) const {
-    //    PADDLE_ENFORCE_LT(level, NumLevels());
-    // the last offset is the end of last element
-    return (lod_)[level].size() - 1;
-  }
-
-private:
-  LoD lod_;
-};
-
-/*
- * Expand the `source` to fit the LoD of `lod`. For example, a `source`
- * LoDTensor is
- *  - LoD: [0, 2]
- *  - tensor: [a0, a1]
- * a `lod` is
- *  - LoD: [0 3 5]
- * returns a new LoDTensor
- *  - [a0 a0 a0 a1 a1]
- */
-template <typename T>
-LoDTensor LodExpand(const LoDTensor &source, const LoD &lod, size_t level) {
-  LoD abs_lod = ToAbsOffset(lod);
-  const auto &lod_level = lod[level];
-  size_t num_instances = source.dims()[0];
-
-  // new tensor
-  LoDTensor tensor;
-  tensor.set_lod(lod);
-  auto dims = source.dims();
-  dims[0] = lod_level.back();
-  tensor.Resize(dims);
-  tensor.mutable_data<T>();
-
-  //  PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
-  for (size_t ins = 0; ins < num_instances; ins++) {
-    for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
-      auto slice = tensor.Slice(elem, elem + 1);
-      TensorCopy(source.Slice(ins, ins + 1), &slice);
-    }
-  }
-  return tensor;
-}
-
-// Get the absolute offset of a lod[start_level][start_idx:end_idx] and
-// relative length of details for every levels(i.e., [start_level: ]).
-//
-// For example,
-//   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
-//   start_level = 0
-//   start_idx = 1
-//   end_idx = 3
-//
-// Returns:
-//  LoD = [[1, 4], [2, 4, 2, 3, 2]]
-//  pair<size_t, size_t> = {11, 24}
-std::pair<LoD, std::pair<size_t, size_t>>
-GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, size_t end_idx,
-                           size_t start_level);
-
-void AppendLoD(LoD *lod, const LoD &lod_length);
-
-/*
- * Serialize/Desiralize LoDTensor to std::ostream
- * You can pass ofstream or ostringstream to serilize to file
- * or to a in memory string. GPU tensor will be copied to CPU.
- */
-void SerializeToStream(std::ostream &os, const LoDTensor &tensor);
-
-void DeserializeFromStream(std::istream &is, LoDTensor *tensor);
-
-} // namespace framework
+    namespace framework {
+
+        /*
+         * LoD is short for Level of Details.
+         *
+         * - in a level, each element indicates relative offset of the lower
+         * level
+         * - the first element should be 0 and that indicates that this sequence
+         * start
+         * from 0
+         * - each sequence's begin and end(no-inclusive) is level[id, id+1]
+         *
+         * For example:
+         *    3-level LoD stores
+         *
+         *    0 2 3
+         *    0 2 4 7
+         *    0 2 5 7 10 12 15 20
+         */
+        using LoD = std::vector<std::vector<size_t>>;
+
+        // Prints `lod` as nested brace groups, one inner group per level.
+        std::ostream &operator<<(std::ostream &os, const LoD &lod);
+
+        // Prints the dims, the LoD and at most the first ten elements of `t`.
+        std::ostream &operator<<(std::ostream &os, const LoDTensor &t);
+
+        // Returns the text that operator<< produces for `lod`.
+        std::string LoDToString(const LoD &lod);
+
+        // Cuts the offsets elem_begin..elem_end (both inclusive) out of
+        // in[level], together with the matching ranges of all lower levels,
+        // and rebases every resulting level to start at offset 0.
+        LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
+                         size_t elem_end);
+
+        /*
+         * Transform an LoD from relative offsets to absolute offsets.
+         */
+        LoD ToAbsOffset(const LoD &in);
+
+        // True when both LoDs have the same number of levels and the same
+        // offsets in every level.
+        bool operator==(const LoD &a, const LoD &b);
+
+        /*
+         * Check whether this lod's format is valid.
+         *
+         * ATTENTION:
+         *   - Empty lod is treated as valid.
+         *
+         * It checks the following:
+         *
+         *  1. all the offsets in a level should be ascending (no repeated
+         *     items allowed).
+         *  2. each level should contain at least 2 offsets.
+         *  3. a higher level's last offset should equal the next lower
+         *     level's size - 1.
+         *  4. the first offset (the begin offset) of each level should be 0.
+         *  5. the lowest level's last offset should equal `tensor_height`
+         *     if tensor_height > 0.
+         */
+
+        bool CheckLoD(const LoD &in, int tensor_height = -1);
+
+        /*
+         * Check whether this absolute lod's format is valid.
+         *
+         * ATTENTION:
+         *   - Empty lod is treated as valid.
+         *
+         * It checks the following:
+         *  1. all the offsets in a level should be ascending (no repeated
+         *     items allowed).
+         *  2. each level should contain at least 2 offsets.
+         *  3. the first offset of each level should be 0, and the last
+         *     offset should be the same for every level (the height of the
+         *     underlying tensor); it should also equal `tensor_height` when
+         *     tensor_height >= 0.
+         */
+        bool CheckAbsLoD(const LoD &in, int tensor_height = -1);
+
+        /*
+         * LoDTensor (Level of details Tensor)
+         * see https://en.wikipedia.org/wiki/Level_of_details for reference.
+         */
+        /*
+         * A Tensor that additionally carries a LoD (level of details), i.e.
+         * per-level offset tables describing how its rows are grouped into
+         * sequences.
+         * See https://en.wikipedia.org/wiki/Level_of_details for reference.
+         */
+        class LoDTensor : public Tensor {
+          public:
+            // Creates a tensor with an empty LoD.
+            LoDTensor() : Tensor() {}
+
+            // Creates a tensor whose LoD is a copy of `lod`.
+            explicit LoDTensor(const LoD &lod) : lod_(lod) {}
+
+            // Replaces the stored LoD with a copy of `lod`.
+            void set_lod(const LoD &lod) { lod_ = lod; }
+
+            // Read-only access to the stored LoD.
+            const LoD &lod() const { return lod_; }
+
+            // Pointer to the stored LoD for in-place modification.
+            LoD *mutable_lod() { return &lod_; }
+
+            /*
+             * Returns the {start, end} offsets of element `elem` on `level`.
+             * Expected but not checked: level < NumLevels() and
+             * elem < NumElements(level).
+             */
+            std::pair<size_t, size_t> lod_element(size_t level,
+                                                  size_t elem) const {
+                const auto &offsets = lod_[level];
+                const size_t start = offsets[elem];
+                const size_t end = offsets[elem + 1];
+                return std::pair<size_t, size_t>(start, end);
+            }
+
+            /*
+             * Number of levels of the LoD. Each level groups units of data;
+             * for text, for example, article, paragraph and sentence would
+             * be 3 levels.
+             */
+            size_t NumLevels() const { return lod_.size(); }
+
+            /*
+             * Number of elements on `level`. Expected but not checked:
+             * level < NumLevels().
+             */
+            size_t NumElements(size_t level = 0) const {
+                // The last offset only marks the end of the last element, so
+                // there is one element fewer than there are offsets.
+                const auto &offsets = lod_[level];
+                return offsets.size() - 1;
+            }
+
+          private:
+            LoD lod_;
+        };
+
+        /*
+         * Expand the `source` to fit the LoD of `lod`. For example, a `source`
+         * LoDTensor is
+         *  - LoD: [0, 2]
+         *  - tensor: [a0, a1]
+         * a `lod` is
+         *  - LoD: [0 3 5]
+         * returns a new LoDTensor
+         *  - [a0 a0 a0 a1 a1]
+         *
+         * source  tensor whose rows are repeated
+         * lod     LoD given to the result
+         * level   level of `lod` whose offsets decide how often each row of
+         *         `source` is repeated
+         *
+         * Expected but not checked: source.dims()[0] equals
+         * lod[level].size() - 1.
+         */
+        template <typename T>
+        LoDTensor LodExpand(const LoDTensor &source, const LoD &lod,
+                            size_t level) {
+            // The absolute-offset form of `lod` used to be computed here but
+            // was never read, so that conversion has been removed.
+            const auto &lod_level = lod[level];
+            size_t num_instances = source.dims()[0];
+
+            // new tensor: same shape as `source` except for the first
+            // dimension, which becomes the last offset of the chosen level
+            LoDTensor tensor;
+            tensor.set_lod(lod);
+            auto dims = source.dims();
+            dims[0] = lod_level.back();
+            tensor.Resize(dims);
+            tensor.mutable_data<T>();
+
+            //  PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
+            // Row `ins` of the source fills the rows
+            // [lod_level[ins], lod_level[ins + 1]) of the result.
+            for (size_t ins = 0; ins < num_instances; ins++) {
+                for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1];
+                     elem++) {
+                    auto slice = tensor.Slice(elem, elem + 1);
+                    TensorCopy(source.Slice(ins, ins + 1), &slice);
+                }
+            }
+            return tensor;
+        }
+
+        // Get the absolute offset of a lod[start_level][start_idx:end_idx] and
+        // relative length of details for every levels(i.e., [start_level: ]).
+        //
+        // For example,
+        //   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
+        //   start_level = 0
+        //   start_idx = 1
+        //   end_idx = 3
+        //
+        // Returns:
+        //  LoD = [[1, 4], [2, 4, 2, 3, 2]]
+        //  pair<size_t, size_t> = {11, 24}
+        std::pair<LoD, std::pair<size_t, size_t>>
+        GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
+                                   size_t end_idx, size_t start_level);
+
+        // Appends sequences to `lod`: `lod_length` holds, per level, the
+        // lengths to add, and each length becomes a new offset (current last
+        // offset + length). An empty `lod` first gets one level per entry of
+        // `lod_length`, each starting with offset 0.
+        void AppendLoD(LoD *lod, const LoD &lod_length);
+
+        /*
+         * Serialize/Deserialize a LoDTensor to/from a standard stream.
+         * You can pass an ofstream or ostringstream to serialize to a file
+         * or to an in-memory string. GPU tensor will be copied to CPU.
+         */
+        void SerializeToStream(std::ostream &os, const LoDTensor &tensor);
+
+        void DeserializeFromStream(std::istream &is, LoDTensor *tensor);
+
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_desc.cpp
+++ b/src/framework/op_desc.cpp
@@ -5,55 +5,58 @@
 #include "op_desc.h"

 namespace paddle_mobile {
-namespace framework {
-
-OpDesc::OpDesc(const proto::OpDesc &desc) : desc_(desc) {
-  for (int i = 0; i < desc_.inputs_size(); ++i) {
-    const proto::OpDesc::Var &var = desc_.inputs(i);
-    std::vector<std::string> &args = inputs_[var.parameter()];
-    int arg_size = var.arguments_size();
-    for (int j = 0; j < arg_size; ++j) {
-      args.push_back(var.arguments(j));
-    }
-  }
-
-  for (int i = 0; i < desc_.outputs_size(); ++i) {
-    const proto::OpDesc::Var &var = desc_.outputs(i);
-    std::vector<std::string> &args = outputs_[var.parameter()];
-    int arg_size = var.arguments_size();
-    for (int j = 0; j < arg_size; ++j) {
-      args.push_back(var.arguments(j));
-    }
-  }
-
-  for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
-    std::string attr_name = attr.name();
-    if (attr.type() != proto::AttrType::BLOCK) {
-      attrs_[attr_name] = Attribute::GetAttrValue(attr);
-      //      if (attr.type() == proto::AttrType::INT){
-      //        std::cout << " attrName " << attr_name << " " <<
-      //        attrs_[attr_name].Get<int>() << std::endl;
-      //      }
-    }
-  }
-}
-
-const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
-  return inputs_.find(name)->second;
-}
-
-const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
-  return outputs_.find(name)->second;
-}
-
-Attribute OpDesc::GetAttr(const std::string &name) const {
-  auto it = attrs_.find(name);
-  return it->second;
-}
-
-const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
-  return attrs_;
-}
-
-} // namespace framework
+    namespace framework {
+
+        // Builds the in-memory description from its protobuf form: copies the
+        // proto, then indexes input/output argument names by slot name and
+        // decodes every attribute except those of type BLOCK.
+        OpDesc::OpDesc(const proto::OpDesc &desc) : desc_(desc) {
+            // Input slots: the slot entry is created even when it has no
+            // arguments.
+            for (const proto::OpDesc::Var &in_var : desc_.inputs()) {
+                std::vector<std::string> &names = inputs_[in_var.parameter()];
+                for (const std::string &arg : in_var.arguments()) {
+                    names.push_back(arg);
+                }
+            }
+
+            // Output slots, handled the same way.
+            for (const proto::OpDesc::Var &out_var : desc_.outputs()) {
+                std::vector<std::string> &names =
+                    outputs_[out_var.parameter()];
+                for (const std::string &arg : out_var.arguments()) {
+                    names.push_back(arg);
+                }
+            }
+
+            // Attributes: BLOCK-typed attributes are skipped.
+            for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
+                if (attr.type() == proto::AttrType::BLOCK) {
+                    continue;
+                }
+                std::string key = attr.name();
+                attrs_[key] = Attribute::GetAttrValue(attr);
+            }
+        }
+
+        // Returns the argument names bound to the input slot `name`.
+        //
+        // Looks the slot up with at(), so an unknown slot raises
+        // std::out_of_range instead of dereferencing the end iterator
+        // (undefined behaviour).
+        const std::vector<std::string> &
+        OpDesc::Input(const std::string &name) const {
+            return inputs_.at(name);
+        }
+
+        // Returns the argument names bound to the output slot `name`.
+        //
+        // Looks the slot up with at(), so an unknown slot raises
+        // std::out_of_range instead of dereferencing the end iterator
+        // (undefined behaviour).
+        const std::vector<std::string> &
+        OpDesc::Output(const std::string &name) const {
+            return outputs_.at(name);
+        }
+
+        // Returns a copy of the attribute stored under `name`.
+        //
+        // Uses at(), so a missing attribute raises std::out_of_range rather
+        // than dereferencing the end iterator (undefined behaviour).
+        Attribute OpDesc::GetAttr(const std::string &name) const {
+            return attrs_.at(name);
+        }
+
+        // Read-only access to the whole attribute table, keyed by name.
+        const std::unordered_map<std::string, Attribute> &
+        OpDesc::GetAttrMap() const {
+            return this->attrs_;
+        }
+
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_desc.h
+++ b/src/framework/op_desc.h
@@ -23,29 +23,31 @@ SOFTWARE.
 #include "paddle_mobile_object.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-class OpDesc : PaddleMobileObject {
-public:
-  OpDesc(const proto::OpDesc &desc);
-  const std::vector<std::string> &Input(const std::string &name) const;
-  const std::vector<std::string> &Output(const std::string &name) const;
-  Attribute GetAttr(const std::string &name) const;
+        // In-memory description of one operator, constructed from a
+        // proto::OpDesc.
+        // NOTE(review): the base class is inherited with the default
+        // (private) access of `class`; confirm that this is intended.
+        class OpDesc : PaddleMobileObject {
+          public:
+            // Builds the description from its protobuf message.
+            OpDesc(const proto::OpDesc &desc);
+            // Argument names bound to the input slot `name`.
+            const std::vector<std::string> &
+            Input(const std::string &name) const;
+            // Argument names bound to the output slot `name`.
+            const std::vector<std::string> &
+            Output(const std::string &name) const;
+            // Value of the attribute called `name`, returned by value.
+            Attribute GetAttr(const std::string &name) const;

-  const VariableNameMap &GetInputs() { return inputs_; }
+            // All input slots. Not const-qualified.
+            const VariableNameMap &GetInputs() { return inputs_; }

-  const VariableNameMap &GetOutputs() { return outputs_; }
+            // All output slots. Not const-qualified.
+            const VariableNameMap &GetOutputs() { return outputs_; }

-  const AttributeMap &GetAttrMap() const;
+            // All attributes, keyed by name.
+            const AttributeMap &GetAttrMap() const;

-  const std::string &Type() { return desc_.type(); };
+            // Operator type string, read from the stored proto.
+            const std::string &Type() { return desc_.type(); };

-private:
-  proto::OpDesc desc_;
-  VariableNameMap inputs_;
-  VariableNameMap outputs_;
-  AttributeMap attrs_;
-};
+          private:
+            proto::OpDesc desc_;      // copy of the source proto message
+            VariableNameMap inputs_;  // input slots
+            VariableNameMap outputs_; // output slots
+            AttributeMap attrs_;      // attributes, keyed by name
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_info.h
+++ b/src/framework/op_info.h
@@ -22,70 +22,74 @@ SOFTWARE.
 #include "framework.pb.h"

 namespace paddle_mobile {
-namespace framework {
-
-template <typename Dtype> struct OpInfo {
-  OpCreator<Dtype> creator_;
-  const OpCreator<Dtype> &Creator() const {
-    //    PADDLE_ENFORCE_NOT_NULL(creator_,
-    //                            "Operator Creator has not been registered");
-    return creator_;
-  }
-};
-
-template <typename Dtype> class OpInfoMap;
-
-template <typename Dtype> static OpInfoMap<Dtype> *g_op_info_map = nullptr;
-
-template <typename Dtype> class OpInfoMap {
-public:
-  static OpInfoMap &Instance() {
-    if (g_op_info_map<Dtype> == nullptr) {
-      g_op_info_map<Dtype> = new OpInfoMap();
-    }
-    return *g_op_info_map<Dtype>;
-  };
-
-  bool Has(const std::string &op_type) const {
-    return map_.find(op_type) != map_.end();
-  }
-
-  void Insert(const std::string &type, const OpInfo<Dtype> &info) {
-    //    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
-    map_.insert({type, info});
-  }
-
-  const OpInfo<Dtype> &Get(const std::string &type) const {
-    auto op_info_ptr = GetNullable(type);
-    //    PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been
-    //    registered",
-    //                            type);
-    return *op_info_ptr;
-  }
-
-  const OpInfo<Dtype> *GetNullable(const std::string &type) const {
-    auto it = map_.find(type);
-    if (it == map_.end()) {
-      return nullptr;
-    } else {
-      return &it->second;
-    }
-  }
-
-  const std::unordered_map<std::string, OpInfo<Dtype>> &map() const {
-    return map_;
-  }
-
-  std::unordered_map<std::string, OpInfo<Dtype>> *mutable_map() {
-    return &map_;
-  }
-
-private:
-  OpInfoMap() = default;
-  std::unordered_map<std::string, OpInfo<Dtype>> map_;
-
-  //  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
-};
-
-} // namespace framework
+    namespace framework {
+
+        // Registration record for one operator type; it carries the
+        // OpCreator stored for that operator.
+        template <typename Dtype> struct OpInfo {
+            OpCreator<Dtype> creator_;
+
+            // Accessor for the stored creator. No check is made that a
+            // creator has actually been registered.
+            const OpCreator<Dtype> &Creator() const { return this->creator_; }
+        };
+
+        template <typename Dtype> class OpInfoMap;
+
+        // Per-Dtype pointer to an OpInfoMap, initially null.
+        // NOTE(review): `static` gives this variable template internal
+        // linkage, so every translation unit that includes this header gets
+        // its own pointer; confirm that a per-TU pointer is acceptable.
+        template <typename Dtype>
+        static OpInfoMap<Dtype> *g_op_info_map = nullptr;
+
+        // Registry mapping an operator type name to its OpInfo. There is one
+        // registry per Dtype, obtained through Instance().
+        template <typename Dtype> class OpInfoMap {
+          public:
+            // Returns the registry for this Dtype, creating it on first use.
+            //
+            // The instance is held in a function-local static: its
+            // initialisation is thread-safe, and the same object is seen
+            // from every translation unit. A header-level `static` pointer
+            // would be duplicated per translation unit. The object is
+            // deliberately never destroyed.
+            static OpInfoMap &Instance() {
+                static OpInfoMap *const instance = new OpInfoMap();
+                return *instance;
+            }
+
+            // True when `op_type` has been inserted.
+            bool Has(const std::string &op_type) const {
+                return map_.find(op_type) != map_.end();
+            }
+
+            // Registers `info` under `type`. If `type` is already present
+            // the existing entry is kept (std::unordered_map::insert does
+            // not overwrite).
+            void Insert(const std::string &type, const OpInfo<Dtype> &info) {
+                map_.insert({type, info});
+            }
+
+            // Returns the entry for `type`.
+            // Precondition: `type` is registered. No check is performed, so
+            // an unknown type dereferences a null pointer; use Has() or
+            // GetNullable() when unsure.
+            const OpInfo<Dtype> &Get(const std::string &type) const {
+                auto op_info_ptr = GetNullable(type);
+                return *op_info_ptr;
+            }
+
+            // Returns a pointer to the entry for `type`, or nullptr when it
+            // is not registered.
+            const OpInfo<Dtype> *GetNullable(const std::string &type) const {
+                auto it = map_.find(type);
+                if (it == map_.end()) {
+                    return nullptr;
+                }
+                return &it->second;
+            }
+
+            // Read-only view of the whole registry.
+            const std::unordered_map<std::string, OpInfo<Dtype>> &map() const {
+                return map_;
+            }
+
+            // Mutable access to the whole registry.
+            std::unordered_map<std::string, OpInfo<Dtype>> *mutable_map() {
+                return &map_;
+            }
+
+          private:
+            // Only Instance() may construct the registry.
+            OpInfoMap() = default;
+            std::unordered_map<std::string, OpInfo<Dtype>> map_;
+
+            //  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
+        };
+
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_kernel_type.h
+++ b/src/framework/op_kernel_type.h
@@ -22,43 +22,51 @@ SOFTWARE.
 #include "framework.pb.h"

 namespace paddle_mobile {
-namespace framework {
-struct OpKernelType {
-  struct Hash {
-    size_t operator()(const OpKernelType &key) const {
-      int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
-      int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
+    namespace framework {
+        // Key describing a kernel variant by its data type and data layout.
+        struct OpKernelType {
+            // Hash functor: shifts data_type_ by LEFT_SHIFT bits and
+            // data_layout_ by 2 * LEFT_SHIFT bits, adds the two values and
+            // hashes the resulting int.
+            struct Hash {
+                size_t operator()(const OpKernelType &key) const {
+                    int data_type = static_cast<int>(key.data_type_)
+                                    << LEFT_SHIFT;
+                    int data_layout = static_cast<int>(key.data_layout_)
+                                      << (LEFT_SHIFT * 2);

-      std::hash<int> hasher;
-      return hasher(data_type + data_layout);
-    }
-  };
+                    std::hash<int> hasher;
+                    return hasher(data_type + data_layout);
+                }
+            };

-  // place, data_type, library_type kinds less than 2^8
-  constexpr static int LEFT_SHIFT = 8;
+            // Bit width reserved for each field in Hash; this assumes every
+            // field has fewer than 2^8 distinct values.
+            constexpr static int LEFT_SHIFT = 8;

-  proto::VarType::Type data_type_;
-  DataLayout data_layout_;
+            proto::VarType::Type data_type_;
+            DataLayout data_layout_;

-  OpKernelType(proto::VarType::Type data_type,
-               DataLayout data_layout = DataLayout::kAnyLayout)
-      : data_type_(data_type), data_layout_(data_layout) {}
+            // The layout defaults to DataLayout::kAnyLayout.
+            OpKernelType(proto::VarType::Type data_type,
+                         DataLayout data_layout = DataLayout::kAnyLayout)
+                : data_type_(data_type), data_layout_(data_layout) {}

-  bool operator==(const OpKernelType &o) const {
-    return data_type_ == o.data_type_ && data_layout_ == o.data_layout_;
-  }
+            // Equal when both the data type and the layout match exactly.
+            bool operator==(const OpKernelType &o) const {
+                return data_type_ == o.data_type_ &&
+                       data_layout_ == o.data_layout_;
+            }

-  bool operator!=(const OpKernelType &o) const { return !(*this == o); }
-};
+            bool operator!=(const OpKernelType &o) const {
+                return !(*this == o);
+            }
+        };

-inline bool NeedTransformLayout(const DataLayout &l, const DataLayout &r) {
-  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
-}
+        // A layout transform is needed only when both layouts are concrete
+        // (neither is kAnyLayout) and they differ.
+        inline bool NeedTransformLayout(const DataLayout &l,
+                                        const DataLayout &r) {
+            if (l == DataLayout::kAnyLayout || r == DataLayout::kAnyLayout) {
+                return false;
+            }
+            return l != r;
+        }

-inline bool TransFromNeeded(const OpKernelType &l, const OpKernelType &r) {
-  return (l.data_type_ != r.data_type_) ||
-         NeedTransformLayout(l.data_layout_, r.data_layout_);
-}
+        // Two kernel types need a transform when their data types differ, or
+        // when their layouts require one (see NeedTransformLayout).
+        inline bool TransFromNeeded(const OpKernelType &l,
+                                    const OpKernelType &r) {
+            if (l.data_type_ != r.data_type_) {
+                return true;
+            }
+            return NeedTransformLayout(l.data_layout_, r.data_layout_);
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_proto_maker.h
+++ b/src/framework/op_proto_maker.h
@@ -19,8 +19,8 @@ SOFTWARE.
 #pragma once

 namespace paddle_mobile {
-namespace framework {
-// this class not only make proto but also init attribute checkers.
-class OpProtoAndCheckerMaker {};
-} // namespace framework
+    namespace framework {
+        // Intended to build an operator's proto and to initialise its
+        // attribute checkers. Currently an empty placeholder with no members.
+        class OpProtoAndCheckerMaker {};
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -20,26 +20,23 @@ SOFTWARE.
 #include "op_info.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-template <typename Dtype>
-OperatorBase<Dtype>::OperatorBase(const std::string &type,
-                                  const VariableNameMap &inputs,
-                                  const VariableNameMap &outputs,
-                                  const AttributeMap &attrs,
-                                  std::shared_ptr<Scope> scope)
-    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs),
-      scope_(scope) {
-  CheckAllInputOutputSet();
-}
+        // Stores the operator's type, slot maps, attributes and scope, then
+        // runs CheckAllInputOutputSet().
+        //
+        // The initialisers are listed in member declaration order (scope_ is
+        // declared first in the class), which is the order in which they
+        // actually run; this avoids -Wreorder warnings.
+        template <typename Dtype>
+        OperatorBase<Dtype>::OperatorBase(const std::string &type,
+                                          const VariableNameMap &inputs,
+                                          const VariableNameMap &outputs,
+                                          const AttributeMap &attrs,
+                                          std::shared_ptr<Scope> scope)
+            : scope_(scope), type_(type), inputs_(inputs), outputs_(outputs),
+              attrs_(attrs) {
+            CheckAllInputOutputSet();
+        }
+        // Called from the constructor. The body is empty, so no validation
+        // of inputs or outputs is performed at present.
+        template <typename Dtype>
+        void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}

-template <typename Dtype> void OperatorBase<Dtype>::Run() { RunImpl(); }
+        // Explicit instantiations for the CPU device type, so the member
+        // definitions in this file are emitted here.
+        template class OperatorBase<CPU>;
+        template class OperatorWithKernel<CPU>;

-template <typename Dtype>
-void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
-
-template class OperatorBase<CPU>;
-template class OperatorWithKernel<CPU>;
-
-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -18,8 +18,6 @@ SOFTWARE.

 #pragma once

-#include <map>
-
 #include "attribute.h"
 #include "block_desc.h"
 #include "common/type_define.h"
@@ -31,62 +29,62 @@ SOFTWARE.
 #include "scope.h"
 #include "tensor.h"
 #include "variable.h"
+#include <map>

 namespace paddle_mobile {
-namespace framework {
-
-template <typename Dtype> class OperatorBase : PaddleMobileObject {
-public:
-  OperatorBase(const std::string &type, const VariableNameMap &inputs,
-               const VariableNameMap &outputs, const AttributeMap &attrs,
-               std::shared_ptr<Scope> scope);
-  virtual ~OperatorBase() {}
-  virtual void Run();
-  const VariableNameMap &Inputs() const { return inputs_; }
-  const VariableNameMap &Outputs() const { return outputs_; }
-  const std::string &Type() const { return type_; }
-  const AttributeMap &Attrs() const { return attrs_; }
-
-protected:
-  std::shared_ptr<Scope> scope_;
-  std::string type_;
-  VariableNameMap inputs_;
-  VariableNameMap outputs_;
-  AttributeMap attrs_;
+    namespace framework {

-private:
-  void CheckAllInputOutputSet() const;
-  virtual void RunImpl() const = 0;
-};
+        // Abstract base of every operator: holds the operator type, its
+        // input/output slot maps, its attributes and the scope it works on.
+        template <typename Dtype> class OperatorBase : PaddleMobileObject {
+          public:
+            OperatorBase(const std::string &type, const VariableNameMap &inputs,
+                         const VariableNameMap &outputs,
+                         const AttributeMap &attrs,
+                         std::shared_ptr<Scope> scope);
+            virtual ~OperatorBase() {}
+            // Executes the operator; implemented by concrete operators.
+            virtual void Run() const = 0;

-template <typename Dtype>
-class OperatorWithKernel : public OperatorBase<Dtype> {
-public:
-  OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
-                     const VariableNameMap &outputs, const AttributeMap &attrs,
-                     std::shared_ptr<Scope> scope)
-      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {}
-  virtual void InferShape() const = 0;
+            const VariableNameMap &Inputs() const { return inputs_; }
+            const VariableNameMap &Outputs() const { return outputs_; }
+            const std::string &Type() const { return type_; }
+            const AttributeMap &Attrs() const { return attrs_; }
+
+            // Erases from the scope the variables bound to the "Filter" and
+            // "Input" slots, in that order. Does nothing without a scope.
+            //
+            // A slot this operator does not have is skipped. Looking it up
+            // with at() would throw std::out_of_range for every operator
+            // lacking a "Filter" or "Input" slot.
+            void ClearVariables() const {
+                if (!this->scope_) {
+                    return;
+                }
+                const char *const slots[] = {"Filter", "Input"};
+                for (const char *slot : slots) {
+                    auto found = this->inputs_.find(slot);
+                    if (found != this->inputs_.end()) {
+                        this->scope_->EraseVars(found->second);
+                    }
+                }
+            }

-  void ClearVariables() const {
-    if (this->scope_) {
-      this->scope_->EraseVars(this->inputs_.at("Filter"));
-      this->scope_->EraseVars(this->inputs_.at("Input"));
-    }
-  }
+          protected:
+            std::shared_ptr<Scope> scope_;
+            std::string type_;
+            VariableNameMap inputs_;
+            VariableNameMap outputs_;
+            AttributeMap attrs_;

-protected:
-  virtual void RunImpl() const = 0;
+          private:
+            // Invoked by the constructor.
+            void CheckAllInputOutputSet() const;
+        };

-private:
-};
+        // Operator that additionally exposes shape inference. It stays
+        // abstract: both InferShape() and Run() are pure virtual.
+        template <typename Dtype>
+        class OperatorWithKernel : public OperatorBase<Dtype> {
+          public:
+            // Forwards all arguments to OperatorBase.
+            OperatorWithKernel(const std::string &type,
+                               const VariableNameMap &inputs,
+                               const VariableNameMap &outputs,
+                               const AttributeMap &attrs,
+                               std::shared_ptr<Scope> scope)
+                : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {}
+            virtual void InferShape() const = 0;
+            // Re-declares OperatorBase::Run(); `override` lets the compiler
+            // verify that the signature keeps matching the base.
+            virtual void Run() const override = 0;
+        };

-template <typename Dtype, typename P> class OpKernelBase : PaddleMobileObject {
-public:
-  virtual void Compute(const P &para) const = 0;
+        // Abstract kernel interface, parameterised on the device type Dtype
+        // and on the parameter bundle type P passed to Compute().
+        template <typename Dtype, typename P>
+        class OpKernelBase : PaddleMobileObject {
+          public:
+            // Runs the kernel on `para`; implemented by concrete kernels.
+            virtual void Compute(const P &para) const = 0;

-  virtual ~OpKernelBase() = default;
-};
+            virtual ~OpKernelBase() = default;
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/paddle_mobile_object.h
+++ b/src/framework/paddle_mobile_object.h
@@ -23,14 +23,14 @@ SOFTWARE.

 namespace paddle_mobile {

-class PaddleMobileObject {
-public:
-  virtual inline const std::string &ToString() {
-    char address[128] = {0};
-    sprintf(address, "%p", this);
-    return std::string(address);
-  }
+    // Common base of framework objects; provides a default textual
+    // representation.
+    class PaddleMobileObject {
+      public:
+        // Returns this object's address formatted with "%p".
+        //
+        // The text is stored in a member so that the returned reference
+        // stays valid for the lifetime of the object (until the next call);
+        // returning a reference to a local temporary would dangle.
+        virtual inline const std::string &ToString() {
+            char address[128] = {0};
+            // "%p" requires a void pointer; snprintf bounds the write.
+            snprintf(address, sizeof(address), "%p",
+                     static_cast<void *>(this));
+            address_text_ = address;
+            return address_text_;
+        }

-private:
-};
+      private:
+        // Backing storage for the reference returned by ToString().
+        std::string address_text_;
+    };
 } // namespace paddle_mobile
--- a/src/framework/program.cpp
+++ b/src/framework/program.cpp
@@ -17,5 +17,5 @@ SOFTWARE.
 ==============================================================================*/

 namespace paddle_mobile {
-namespace framework {}
+    namespace framework {}
 } // namespace paddle_mobile
--- a/src/framework/program.h
+++ b/src/framework/program.h
@@ -24,17 +24,17 @@ SOFTWARE.
 #include "scope.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-template <typename Dtype, Precision P = Precision::FP32>
-class Program : PaddleMobileObject {
-public:
-  std::shared_ptr<ProgramDesc> originProgram;
-  std::shared_ptr<ProgramDesc> optimizeProgram;
-  std::shared_ptr<Scope> scope;
+        // Bundle of the shared pointers that make up a loaded program: two
+        // ProgramDesc handles and a Scope. The precision parameter P
+        // defaults to FP32.
+        template <typename Dtype, Precision P = Precision::FP32>
+        class Program : PaddleMobileObject {
+          public:
+            std::shared_ptr<ProgramDesc> originProgram;
+            std::shared_ptr<ProgramDesc> optimizeProgram;
+            std::shared_ptr<Scope> scope;

-private:
-};
+          private:
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/program_desc.cpp
+++ b/src/framework/program_desc.cpp
@@ -5,18 +5,18 @@
 #include "program_desc.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) : desc_(desc) {
-  for (auto &block_desc : *desc_.mutable_blocks()) {
-    // new framework::BlockDesc(block_desc)
-    blocks_.emplace_back(std::make_shared<BlockDesc>(block_desc));
-  }
-}
+        // Copies the proto and wraps each of its blocks in a shared
+        // BlockDesc, preserving block order.
+        ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) : desc_(desc) {
+            auto *proto_blocks = desc_.mutable_blocks();
+            for (auto &proto_block : *proto_blocks) {
+                blocks_.push_back(std::make_shared<BlockDesc>(proto_block));
+            }
+        }

-std::shared_ptr<BlockDesc> ProgramDesc::Block(size_t idx) {
-  return blocks_[idx];
-}
+        // Returns the block at position `idx`, or nullptr when `idx` is out
+        // of range (indexing past the end of the vector would be undefined
+        // behaviour).
+        std::shared_ptr<BlockDesc> ProgramDesc::Block(size_t idx) {
+            if (idx >= blocks_.size()) {
+                return nullptr;
+            }
+            return blocks_[idx];
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/program_desc.h
+++ b/src/framework/program_desc.h
@@ -25,18 +25,20 @@ SOFTWARE.
 #include "paddle_mobile_object.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-class ProgramDesc : PaddleMobileObject {
-public:
-  ProgramDesc(const proto::ProgramDesc &desc);
-  std::shared_ptr<BlockDesc> Block(size_t idx);
-  const std::vector<std::shared_ptr<BlockDesc>> &Blocks() { return blocks_; };
+        // In-memory description of a program, constructed from a
+        // proto::ProgramDesc and exposing its blocks as BlockDesc objects.
+        class ProgramDesc : PaddleMobileObject {
+          public:
+            ProgramDesc(const proto::ProgramDesc &desc);
+            // Returns the block at position `idx`.
+            std::shared_ptr<BlockDesc> Block(size_t idx);
+            // All blocks, in stored order. Not const-qualified.
+            const std::vector<std::shared_ptr<BlockDesc>> &Blocks() {
+                return blocks_;
+            };

-private:
-  std::vector<std::shared_ptr<BlockDesc>> blocks_;
-  proto::ProgramDesc desc_;
-};
+          private:
+            std::vector<std::shared_ptr<BlockDesc>> blocks_;
+            proto::ProgramDesc desc_; // copy of the source proto message
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/scope.cc
+++ b/src/framework/scope.cc
@@ -4,113 +4,116 @@
 #include <vector>

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-Scope &Scope::NewScope() const {
-  std::unique_lock<std::mutex> lock(mutex_);
-  kids_.push_back(new Scope(this));
-  return *kids_.back();
-}
+        // Allocates a child scope constructed from `this`, appends it to
+        // kids_ while holding mutex_, and returns a reference to it.
+        Scope &Scope::NewScope() const {
+            std::unique_lock<std::mutex> guard(mutex_);
+            kids_.push_back(new Scope(this));
+            Scope *child = kids_.back();
+            return *child;
+        }

-Variable *Scope::Var(const std::string &name) {
-  auto *pvar = FindVarLocally(name);
-  if (pvar != nullptr) {
-    return pvar;
-  };
-  pvar = new Variable;
-  vars_[name] = pvar;
-  pvar->name_ = &(vars_.find(name)->first);
-  return pvar;
-}
+        // Returns the local variable called `name`, creating it first when
+        // this scope does not have one yet. A new variable's name_ is made
+        // to point at the key stored inside vars_.
+        Variable *Scope::Var(const std::string &name) {
+            if (Variable *existing = FindVarLocally(name)) {
+                return existing;
+            }
+            Variable *created = new Variable;
+            vars_[name] = created;
+            const auto entry = vars_.find(name);
+            created->name_ = &(entry->first);
+            return created;
+        }

-//            Variable* Scope::Var(std::string* name) {
-//                auto var_name = string::Sprintf("%p.%d", this, vars_.size());
-//                if (name != nullptr) {
-//                    *name = var_name;
-//                }
-//                return Var(var_name);
-//            }
+        //            Variable* Scope::Var(std::string* name) {
+        //                auto var_name = string::Sprintf("%p.%d", this,
+        //                vars_.size());
+        //                if (name != nullptr) {
+        //                    *name = var_name;
+        //                }
+        //                return Var(var_name);
+        //            }

-Variable *Scope::FindVar(const std::string &name) const {
-  auto *pvar = FindVarLocally(name);
-  if (pvar != nullptr) {
-    return pvar;
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
-}
+        // Looks `name` up in this scope and then in each ancestor in turn;
+        // returns nullptr when no scope in the chain has it.
+        Variable *Scope::FindVar(const std::string &name) const {
+            for (const Scope *scope = this; scope != nullptr;
+                 scope = scope->parent_) {
+                if (Variable *hit = scope->FindVarLocally(name)) {
+                    return hit;
+                }
+            }
+            return nullptr;
+        }

-const Scope *Scope::FindScope(const Variable *var) const {
-  for (auto &name_var : vars_) {
-    if (name_var.second == var) {
-      return this;
-    }
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
-}
+        // Returns the nearest scope (this one or an ancestor) whose local
+        // table holds the pointer `var`, or nullptr when none does.
+        const Scope *Scope::FindScope(const Variable *var) const {
+            const Scope *scope = this;
+            while (scope != nullptr) {
+                for (const auto &entry : scope->vars_) {
+                    if (entry.second == var) {
+                        return scope;
+                    }
+                }
+                scope = scope->parent_;
+            }
+            return nullptr;
+        }

-void Scope::DropKids() {
-  for (Scope *s : kids_) {
-    delete s;
-  }
-  kids_.clear();
-}
+        // Deletes every child scope and empties kids_.
+        void Scope::DropKids() {
+            for (auto kid = kids_.begin(); kid != kids_.end(); ++kid) {
+                delete *kid;
+            }
+            kids_.clear();
+        }

-std::vector<std::string> Scope::LocalVarNames() const {
-  std::vector<std::string> known_vars;
-  known_vars.reserve(vars_.size());
-  for (auto &name_var : vars_) {
-    known_vars.emplace_back(name_var.first);
-  }
-  return known_vars;
-}
+        // Returns the names of the variables held directly by this scope,
+        // in the iteration order of vars_.
+        std::vector<std::string> Scope::LocalVarNames() const {
+            std::vector<std::string> names;
+            names.reserve(vars_.size());
+            for (const auto &entry : vars_) {
+                names.push_back(entry.first);
+            }
+            return names;
+        }

-void Scope::DeleteScope(Scope *scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
-  auto it = std::find(kids_.begin(), kids_.end(), scope);
-  kids_.erase(it);
-  delete scope;
-  // deferent
-}
+        // Removes the child `scope` from kids_ and deletes it, while holding
+        // mutex_.
+        //
+        // When `scope` is not a child of this scope nothing happens:
+        // erasing the end iterator is undefined behaviour, and this scope
+        // must not delete an object it does not own.
+        void Scope::DeleteScope(Scope *scope) const {
+            std::unique_lock<std::mutex> lock(mutex_);
+            auto it = std::find(kids_.begin(), kids_.end(), scope);
+            if (it == kids_.end()) {
+                return;
+            }
+            kids_.erase(it);
+            delete scope;
+        }

-void Scope::EraseVars(const std::vector<std::string> &var_names) {
-  std::set<std::string> var_set(var_names.begin(), var_names.end());
-  for (auto it = vars_.begin(); it != vars_.end();) {
-    if (var_set.find(it->first) != var_set.end()) {
-      delete it->second;
-      it = vars_.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
+        // Deletes and removes every local variable whose name appears in
+        // `var_names`; names that are not present are ignored.
+        void Scope::EraseVars(const std::vector<std::string> &var_names) {
+            std::set<std::string> doomed(var_names.begin(), var_names.end());
+            auto it = vars_.begin();
+            while (it != vars_.end()) {
+                if (doomed.find(it->first) == doomed.end()) {
+                    ++it;
+                    continue;
+                }
+                delete it->second;
+                // erase() returns the iterator following the removed entry.
+                it = vars_.erase(it);
+            }
+        }

-void Scope::Rename(const std::string &origin_name,
-                   const std::string &new_name) const {
-  auto origin_it = vars_.find(origin_name);
-  if (origin_it == vars_.end()) {
-    return;
-  }
-  auto new_it = vars_.find(new_name);
-  if (new_it != vars_.end()) {
-    return;
-  }
-  vars_[new_name] = origin_it->second;
-  vars_.erase(origin_it);
-}
-//
-//            std::string Scope::Rename(const std::string& origin_name) const {
-//                auto var_name = string::Sprintf("%p.%d", this, vars_.size());
-//                Rename(origin_name, var_name);
-//                return var_name;
-//            }
+        // Re-keys the local variable `origin_name` as `new_name`.
+        //
+        // Does nothing when `origin_name` is not a local variable or when
+        // `new_name` is already in use.
+        void Scope::Rename(const std::string &origin_name,
+                           const std::string &new_name) const {
+            auto origin_it = vars_.find(origin_name);
+            if (origin_it == vars_.end()) {
+                return;
+            }
+            if (vars_.find(new_name) != vars_.end()) {
+                return;
+            }
+            // Take the pointer and erase the old entry before inserting:
+            // an insertion may invalidate origin_it.
+            Variable *var = origin_it->second;
+            vars_.erase(origin_it);
+            vars_[new_name] = var;
+            if (var != nullptr) {
+                // Scope::Var() makes name_ point at the key stored inside
+                // vars_. The old key has just been destroyed, so re-point
+                // name_ at the new key instead of leaving it dangling.
+                var->name_ = &(vars_.find(new_name)->first);
+            }
+        }
+        //
+        //            std::string Scope::Rename(const std::string& origin_name)
+        //            const {
+        //                auto var_name = string::Sprintf("%p.%d", this,
+        //                vars_.size());
+        //                Rename(origin_name, var_name);
+        //                return var_name;
+        //            }

-Variable *Scope::FindVarLocally(const std::string &name) const {
-  auto it = vars_.find(name);
-  if (it != vars_.end()) {
-    return it->second;
-  }
-  return nullptr;
-}
+        // Looks `name` up in this scope only (ancestors are not consulted);
+        // returns nullptr when it is absent.
+        Variable *Scope::FindVarLocally(const std::string &name) const {
+            const auto found = vars_.find(name);
+            return found == vars_.end() ? nullptr : found->second;
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/scope.h
+++ b/src/framework/scope.h
@@ -24,57 +24,58 @@ SOFTWARE.
 #include <unordered_map> //std::unordered_map

 namespace paddle_mobile {
-namespace framework {
-class Scope {
-public:
-  Scope() {}
-  ~Scope() {}
+    namespace framework {
+        class Scope {
+          public:
+            Scope() {}
+            ~Scope() {}

-  Scope &NewScope() const;
+            Scope &NewScope() const;

-  /// Create a variable with given name if it doesn't exist.
-  Variable *Var(const std::string &name);
+            /// Create a variable with given name if it doesn't exist.
+            Variable *Var(const std::string &name);

-  /// Create a variable with a scope-unique name.
-  Variable *Var(std::string *name = nullptr);
+            /// Create a variable with a scope-unique name.
+            Variable *Var(std::string *name = nullptr);

-  void EraseVars(const std::vector<std::string> &var_names);
+            void EraseVars(const std::vector<std::string> &var_names);

-  /// Find a variable in the scope or any of its ancestors.  Returns
-  /// nullptr if cannot find.
-  Variable *FindVar(const std::string &name) const;
+            /// Find a variable in the scope or any of its ancestors.  Returns
+            /// nullptr if cannot find.
+            Variable *FindVar(const std::string &name) const;

-  const Scope *parent() const { return parent_; }
+            const Scope *parent() const { return parent_; }

-  /// Find the scope or an ancestor scope that contains the given variable.
-  const Scope *FindScope(const Variable *var) const;
+            /// Find the scope or an ancestor scope that contains the given
+            /// variable.
+            const Scope *FindScope(const Variable *var) const;

-  void DeleteScope(Scope *scope) const;
+            void DeleteScope(Scope *scope) const;

-  /// Drop all kids scopes belonged to this scope.
-  void DropKids();
+            /// Drop all kid scopes that belong to this scope.
+            void DropKids();

-  // enumerate all the variables current contains.
-  std::vector<std::string> LocalVarNames() const;
+            // Enumerate all the variables the current scope contains.
+            std::vector<std::string> LocalVarNames() const;

-  // Rename variable to a new name
-  void Rename(const std::string &origin_name,
-              const std::string &new_name) const;
+            // Rename variable to a new name
+            void Rename(const std::string &origin_name,
+                        const std::string &new_name) const;

-  // Rename variable to a new name and return the new name
-  std::string Rename(const std::string &origin_name) const;
+            // Rename variable to a new name and return the new name
+            std::string Rename(const std::string &origin_name) const;

-  Variable *FindVarLocally(const std::string &name) const;
+            Variable *FindVarLocally(const std::string &name) const;

-private:
-  // Call Scope::NewScope for a sub-scope.
-  explicit Scope(Scope const *parent) : parent_(parent) {}
+          private:
+            // Call Scope::NewScope for a sub-scope.
+            explicit Scope(Scope const *parent) : parent_(parent) {}

-  mutable std::unordered_map<std::string, Variable *> vars_;
-  mutable std::list<Scope *> kids_;
-  Scope const *parent_{nullptr};
+            mutable std::unordered_map<std::string, Variable *> vars_;
+            mutable std::list<Scope *> kids_;
+            Scope const *parent_{nullptr};

-  mutable std::mutex mutex_;
-};
-} // namespace framework
+            mutable std::mutex mutex_;
+        };
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/selected_rows.h
+++ b/src/framework/selected_rows.h
@@ -24,57 +24,59 @@ SOFTWARE.
 #include "tensor.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-class SelectedRows {
-public:
-  SelectedRows(const std::vector<int64_t> &rows, const int64_t &height)
-      : rows_(rows), height_(height) {
-    value_.reset(new Tensor());
-  }
+        class SelectedRows {
+          public:
+            SelectedRows(const std::vector<int64_t> &rows,
+                         const int64_t &height)
+                : rows_(rows), height_(height) {
+                value_.reset(new Tensor());
+            }

-  SelectedRows() {
-    height_ = 0;
-    value_.reset(new Tensor());
-  }
+            SelectedRows() {
+                height_ = 0;
+                value_.reset(new Tensor());
+            }

-  const Tensor &value() const { return *value_; }
+            const Tensor &value() const { return *value_; }

-  Tensor *mutable_value() { return value_.get(); }
+            Tensor *mutable_value() { return value_.get(); }

-  int64_t height() const { return height_; }
+            int64_t height() const { return height_; }

-  void set_height(int64_t height) { height_ = height; }
+            void set_height(int64_t height) { height_ = height; }

-  const std::vector<int64_t> &rows() const { return rows_; }
+            const std::vector<int64_t> &rows() const { return rows_; }

-  std::vector<int64_t> *mutable_rows() { return &rows_; }
+            std::vector<int64_t> *mutable_rows() { return &rows_; }

-  void set_rows(const std::vector<int64_t> &rows) { rows_ = rows; }
+            void set_rows(const std::vector<int64_t> &rows) { rows_ = rows; }

-  /**
-   * get the index of id in rows
-   */
-  int64_t index(int64_t id) const {
-    auto it = std::find(rows_.begin(), rows_.end(), id);
-    //    PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
-    return static_cast<int64_t>(std::distance(rows_.begin(), it));
-  }
+            /**
+             * get the index of id in rows
+             */
+            int64_t index(int64_t id) const {
+                auto it = std::find(rows_.begin(), rows_.end(), id);
+                //    PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
+                return static_cast<int64_t>(std::distance(rows_.begin(), it));
+            }

-  DDim GetCompleteDims() const {
-    std::vector<int64_t> dims = vectorize(value_->dims());
-    dims[0] = height_;
-    return make_ddim(dims);
-  }
+            DDim GetCompleteDims() const {
+                std::vector<int64_t> dims = vectorize(value_->dims());
+                dims[0] = height_;
+                return make_ddim(dims);
+            }

-private:
-  // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
-  // SelectedRows are simply concated when adding together. Until a
-  // SelectedRows add a Tensor, will the duplicate rows be handled.
-  std::vector<int64_t> rows_;
-  std::unique_ptr<Tensor> value_{nullptr};
-  int64_t height_;
-};
+          private:
+            // Note: rows may contain duplicates, e.g. {0, 4, 7, 0, 5, 7, 9}.
+            // SelectedRows are simply concatenated when added together;
+            // duplicate rows are only handled once a SelectedRows is added
+            // to a Tensor.
+            std::vector<int64_t> rows_;
+            std::unique_ptr<Tensor> value_{nullptr};
+            int64_t height_;
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -25,287 +25,316 @@ limitations under the License. */
 #include "memory/t_malloc.h"

 namespace paddle_mobile {
-namespace framework {
-template <typename... T> struct SizeOfTypeFunctor;
-
-template <typename T> struct SizeOfTypeFunctor<T> {
-  size_t operator()(std::type_index type) const {
-    if (typeid(T).hash_code() == type.hash_code()) {
-      return sizeof(T);
-    } else {
-      return 0UL;
-    }
-  }
-};
-
-template <> struct SizeOfTypeFunctor<> {
-  size_t operator()(std::type_index type) const { return 0UL; }
-};
-
-template <typename HEAD, typename... TAIL>
-struct SizeOfTypeFunctor<HEAD, TAIL...> {
-  size_t operator()(std::type_index type) const {
-    SizeOfTypeFunctor<HEAD> head;
-    size_t head_size = head(type);
-    if (head_size != 0) {
-      return head_size;
-    }
-    SizeOfTypeFunctor<TAIL...> tail;
-    return tail(type);
-  }
-};
-
-static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
-  size_t size = functor(type);
-  //  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
-  return size;
-}
-
-class LoDTensor;
-
-class Tensor {
-public:
-  Tensor() : offset_(0) {}
-
-  /*! Return a pointer to mutable memory block. */
-  template <typename T> inline T *data() {
-    check_memory_size();
-    //  PADDLE_ENFORCE(std::is_same<T, void>::value ||
-    //                     holder_->type().hash_code() == typeid(T).hash_code(),
-    //                 "Tensor holds the wrong type, it holds %s",
-    //                 this->holder_->type().name());
-    return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-  }
-
-  /*! Return a pointer to constant memory block. */
-  template <typename T> inline const T *data() const {
-    check_memory_size();
-    //  PADDLE_ENFORCE(std::is_same<T, void>::value ||
-    //                     holder_->type().hash_code() == typeid(T).hash_code(),
-    //                 "Tensor holds the wrong type, it holds %s",
-    //                 this->holder_->type().name());
-
-    return reinterpret_cast<const T *>(
-        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-  }
-
-  inline bool IsInitialized() const { return holder_ != nullptr; }
-
-  /**
-   * @brief   Return a pointer to mutable memory block.
-   * @note    If not exist, then allocation.
-   */
-  template <typename T> inline T *mutable_data() {
-    static_assert(std::is_pod<T>::value, "T must be POD");
-    return reinterpret_cast<T *>(mutable_data(typeid(T)));
-  }
-
-  inline void *mutable_data(std::type_index type) {
-    if (holder_ != nullptr) {
-      holder_->set_type(type);
-    }
-    //  PADDLE_ENFORCE_GE(numel(), 0,
-    //                    "When calling this method, the Tensor's numel must be
-    //                    " "equal or larger than zero. " "Please check
-    //                    Tensor::Resize has been called first.");
-    int64_t size = numel() * SizeOfType(type);
-    /* some versions of boost::variant don't have operator!= */
-    if (holder_ == nullptr || holder_->size() < size + offset_) {
-      holder_.reset(new PlaceholderImpl(size, type));
-
-      offset_ = 0;
-    }
-    return reinterpret_cast<void *>(
-        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-  }
-
-  inline void *mutable_data() {
-    //  PADDLE_ENFORCE(this->holder_ != nullptr,
-    //                 "Cannot invoke mutable data if current hold nothing.");
-    return mutable_data(holder_->type());
-  }
-
-  /**
-   * @brief     Return a pointer to mutable memory block.
-   *
-   * @param[in] dims    The dimensions of the memory block.
-   * @param[in] place   The place of the memory block.
-   *
-   * @note      If not exist, then allocation.
-   */
-  template <typename T> inline T *mutable_data(DDim dims) {
-    static_assert(std::is_pod<T>::value, "T must be POD");
-    Resize(dims);
-    return mutable_data<T>();
-  }
-
-  /*! Return the dimensions of the memory block. */
-  inline const DDim &dims() const { return dims_; }
-
-  /*! Return the numel of the memory block. */
-  inline int64_t numel() const { return product(dims_); }
-
-  /*! Resize the dimensions of the memory block. */
-  inline Tensor &Resize(const DDim &dims) {
-    dims_ = dims;
-    return *this;
-  }
-
-  /*! The internal of two tensors share the same memory block. */
-  inline Tensor &ShareDataWith(const Tensor &src) {
-    src.check_memory_size();
-    *this = src;
-    return *this;
-  }
-
-  /**
-   * @brief  Return a sub-tensor of the given tensor.
-   *
-   * @param[in] begin_idx   The index of the start row(inclusive) to slice.
-   *                        The index number begins from 0.
-   * @param[in] end_idx     The index of the end row(exclusive) to slice.
-   *                        The index number begins from 0.
-   */
-  inline Tensor Slice(int begin_idx, int end_idx) const {
-    check_memory_size();
-    //  PADDLE_ENFORCE_GE(begin_idx, 0,
-    //                    "The start row index must be greater than 0.");
-    //  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of
-    //  bound."); PADDLE_ENFORCE_LT(
-    //      begin_idx, end_idx,
-    //      "The start row index must be lesser than the end row index.");
-
-    if (dims_[0] == 1) {
-      return *this;
-    } else {
-      size_t base = numel() / dims_[0];
-      Tensor dst;
-      dst.holder_ = holder_;
-      dst.set_layout(layout_);
-      DDim dst_dims = dims_;
-      dst_dims[0] = end_idx - begin_idx;
-      dst.Resize(dst_dims);
-      dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
-      return dst;
-    }
-  }
-
-  std::type_index type() const {
-    //                PADDLE_ENFORCE_NOT_NULL(
-    //                        holder_, "Tensor not initialized yet when
-    //                        Tensor::type() is called.");
-    return holder_->type();
-  }
-
-  // memory size returns the holding memory size in byte.
-  size_t memory_size() const {
-    return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-  }
-
-  inline void check_memory_size() const {
-    //  PADDLE_ENFORCE_NOT_NULL(
-    //      holder_, "Tensor holds no memory. Call Tensor::mutable_data
-    //      first.");
-    //  PADDLE_ENFORCE_LE(
-    //      numel() * SizeOfType(type()), memory_size(),
-    //      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-    //      "first to re-allocate memory.\n"
-    //      "or maybe the required data-type mismatches the data already
-    //      stored.");
-  }
-
-  inline DataLayout layout() const { return layout_; }
-
-  inline void set_layout(const DataLayout layout) { layout_ = layout; }
-
-private:
-  /**
-   * @note    Placeholder hides type T, so it doesn't appear as a template
-   *          parameter of Variable.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() = default;
-
-    virtual void *ptr() const = 0;
-
-    virtual size_t size() const = 0;
-
-    virtual std::type_index type() const = 0;
-
-    virtual void set_type(std::type_index type) = 0;
-  };
-
-  struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(size_t size, std::type_index type)
-        : ptr_(static_cast<uint8_t *>(memory::Alloc(size)),
-               memory::PODDeleter<uint8_t>()),
-          size_(size), type_(type) {
-      //                    PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s
-      //                    memory to allocation.",
-      //                                            (is_cpu_place(place_) ?
-      //                                            "CPU" : "GPU"));
-    }
-
-    virtual size_t size() const { return size_; }
-
-    virtual void *ptr() const { return static_cast<void *>(ptr_.get()); }
-
-    virtual std::type_index type() const { return type_; }
-
-    virtual void set_type(std::type_index type) { type_ = type; }
-
-    /*! the pointer of memory block. */
-    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
-
-    /*! the size of memory block. */
-    size_t size_;
-
-    /* the current type of memory */
-    std::type_index type_;
-  };
-
-  /*! holds the memory block if allocated. */
-  std::shared_ptr<Placeholder> holder_;
-
-  /**
-   * @brief points to elements dimensions.
-   *
-   * @note dims_ do not indicate the memory block size.
-   */
-
-  DDim dims_;
-
-  /**
-   * @brief the layout of memory block, default is NHWC.
-   *
-   * @note the memory allocation order, describe how weight/data is stored
-   *       For example, in 4-D Tensor(rank=4), there are three commonly
-   *       used layout. They are
-   *            NCHW, NHWC, CHWN.
-   *       N,C,H,W for respectively the batch size, the number of
-   *       feature maps, the height, the width.
-   */
-
-  DataLayout layout_ = DataLayout::kNHWC;
-
-  /**
-   * @brief   A PlaceHolder may be shared by more than one tensor.
-   *
-   * @note    Some of them may be slices of the others. So the offset_
-   *          is introduced here to indicate the byte offset between
-   *          PlaceHolder::ptr_ and where the tensor data really begins.
-   */
-  size_t offset_;
-};
-
-inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
-  Tensor res;
-  res.ShareDataWith(src);
-  res.Resize(flatten_to_2d(src.dims(), num_col_dims));
-  return res;
-}
-
-} // namespace framework
+    namespace framework {
+        template <typename... T> struct SizeOfTypeFunctor;
+
+        template <typename T> struct SizeOfTypeFunctor<T> {
+            size_t operator()(std::type_index type) const {
+                if (typeid(T).hash_code() == type.hash_code()) {
+                    return sizeof(T);
+                } else {
+                    return 0UL;
+                }
+            }
+        };
+
+        template <> struct SizeOfTypeFunctor<> {
+            size_t operator()(std::type_index type) const { return 0UL; }
+        };
+
+        template <typename HEAD, typename... TAIL>
+        struct SizeOfTypeFunctor<HEAD, TAIL...> {
+            size_t operator()(std::type_index type) const {
+                SizeOfTypeFunctor<HEAD> head;
+                size_t head_size = head(type);
+                if (head_size != 0) {
+                    return head_size;
+                }
+                SizeOfTypeFunctor<TAIL...> tail;
+                return tail(type);
+            }
+        };
+
+        static inline size_t SizeOfType(std::type_index type) {
+            SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool,
+                              size_t>
+                functor;
+            size_t size = functor(type);
+            //  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s",
+            //  type.name());
+            return size;
+        }
+
+        class LoDTensor;
+
+        class Tensor {
+          public:
+            Tensor() : offset_(0) {}
+
+            /*! Return a pointer to mutable memory block. */
+            template <typename T> inline T *data() {
+                check_memory_size();
+                //  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+                //                     holder_->type().hash_code() ==
+                //                     typeid(T).hash_code(),
+                //                 "Tensor holds the wrong type, it holds %s",
+                //                 this->holder_->type().name());
+                return reinterpret_cast<T *>(
+                    reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+            }
+
+            /*! Return a pointer to constant memory block. */
+            template <typename T> inline const T *data() const {
+                check_memory_size();
+                //  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+                //                     holder_->type().hash_code() ==
+                //                     typeid(T).hash_code(),
+                //                 "Tensor holds the wrong type, it holds %s",
+                //                 this->holder_->type().name());
+
+                return reinterpret_cast<const T *>(
+                    reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+            }
+
+            inline bool IsInitialized() const { return holder_ != nullptr; }
+
+            /**
+             * @brief   Return a pointer to mutable memory block.
+             * @note    Allocates the memory block if it does not exist yet.
+             */
+            template <typename T> inline T *mutable_data() {
+                static_assert(std::is_pod<T>::value, "T must be POD");
+                return reinterpret_cast<T *>(mutable_data(typeid(T)));
+            }
+
+            inline void *mutable_data(std::type_index type) {
+                if (holder_ != nullptr) {
+                    holder_->set_type(type);
+                }
+                //  PADDLE_ENFORCE_GE(numel(), 0,
+                //                    "When calling this method, the Tensor's
+                //                    numel must be
+                //                    " "equal or larger than zero. " "Please
+                //                    check
+                //                    Tensor::Resize has been called first.");
+                int64_t size = numel() * SizeOfType(type);
+                /* some versions of boost::variant don't have operator!= */
+                if (holder_ == nullptr || holder_->size() < size + offset_) {
+                    holder_.reset(new PlaceholderImpl(size, type));
+
+                    offset_ = 0;
+                }
+                return reinterpret_cast<void *>(
+                    reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+            }
+
+            inline void *mutable_data() {
+                //  PADDLE_ENFORCE(this->holder_ != nullptr,
+                //                 "Cannot invoke mutable data if current hold
+                //                 nothing.");
+                return mutable_data(holder_->type());
+            }
+
+            /**
+             * @brief     Return a pointer to mutable memory block.
+             *
+             * @param[in] dims    The dimensions of the memory block.
+             * @param[in] place   The place of the memory block.
+             *
+             * @note      Allocates the memory block if it does not exist yet.
+             */
+            template <typename T> inline T *mutable_data(DDim dims) {
+                static_assert(std::is_pod<T>::value, "T must be POD");
+                Resize(dims);
+                return mutable_data<T>();
+            }
+
+            /*! Return the dimensions of the memory block. */
+            inline const DDim &dims() const { return dims_; }
+
+            /*! Return the numel of the memory block. */
+            inline int64_t numel() const { return product(dims_); }
+
+            /*! Resize the dimensions of the memory block. */
+            inline Tensor &Resize(const DDim &dims) {
+                dims_ = dims;
+                return *this;
+            }
+
+            /*! Make this tensor share the same memory block as src. */
+            inline Tensor &ShareDataWith(const Tensor &src) {
+                src.check_memory_size();
+                *this = src;
+                return *this;
+            }
+
+            /**
+             * @brief  Return a sub-tensor of the given tensor.
+             *
+             * @param[in] begin_idx   The index of the start row (inclusive)
+             *                        to slice.
+             *                        The index number begins from 0.
+             * @param[in] end_idx     The index of the end row (exclusive)
+             *                        to slice.
+             *                        The index number begins from 0.
+             */
+            inline Tensor Slice(int begin_idx, int end_idx) const {
+                check_memory_size();
+                //  PADDLE_ENFORCE_GE(begin_idx, 0,
+                //                    "The start row index must be greater than
+                //                    0.");
+                //  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is
+                //  out of
+                //  bound."); PADDLE_ENFORCE_LT(
+                //      begin_idx, end_idx,
+                //      "The start row index must be lesser than the end row
+                //      index.");
+
+                if (dims_[0] == 1) {
+                    return *this;
+                } else {
+                    size_t base = numel() / dims_[0];
+                    Tensor dst;
+                    dst.holder_ = holder_;
+                    dst.set_layout(layout_);
+                    DDim dst_dims = dims_;
+                    dst_dims[0] = end_idx - begin_idx;
+                    dst.Resize(dst_dims);
+                    dst.offset_ =
+                        offset_ + begin_idx * base * SizeOfType(type());
+                    return dst;
+                }
+            }
+
+            std::type_index type() const {
+                //                PADDLE_ENFORCE_NOT_NULL(
+                //                        holder_, "Tensor not initialized yet
+                //                        when
+                //                        Tensor::type() is called.");
+                return holder_->type();
+            }
+
+            // memory_size returns the size of the held memory in bytes.
+            size_t memory_size() const {
+                return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+            }
+
+            inline void check_memory_size() const {
+                //  PADDLE_ENFORCE_NOT_NULL(
+                //      holder_, "Tensor holds no memory. Call
+                //      Tensor::mutable_data
+                //      first.");
+                //  PADDLE_ENFORCE_LE(
+                //      numel() * SizeOfType(type()), memory_size(),
+                //      "Tensor's dims_ is out of bound. Call
+                //      Tensor::mutable_data "
+                //      "first to re-allocate memory.\n"
+                //      "or maybe the required data-type mismatches the data
+                //      already
+                //      stored.");
+            }
+
+            inline DataLayout layout() const { return layout_; }
+
+            inline void set_layout(const DataLayout layout) {
+                layout_ = layout;
+            }
+
+          private:
+            /**
+             * @note    Placeholder hides type T, so that it
+             *          doesn't appear as a template parameter
+             *          of Variable.
+             */
+            struct Placeholder {
+                virtual ~Placeholder() = default;
+
+                virtual void *ptr() const = 0;
+
+                virtual size_t size() const = 0;
+
+                virtual std::type_index type() const = 0;
+
+                virtual void set_type(std::type_index type) = 0;
+            };
+
+            struct PlaceholderImpl : public Placeholder {
+                PlaceholderImpl(size_t size, std::type_index type)
+                    : ptr_(static_cast<uint8_t *>(memory::Alloc(size)),
+                           memory::PODDeleter<uint8_t>()),
+                      size_(size), type_(type) {
+                    //                    PADDLE_ENFORCE_NOT_NULL(ptr_,
+                    //                    "Insufficient %s
+                    //                    memory to allocation.",
+                    //                                            (is_cpu_place(place_)
+                    //                                            ?
+                    //                                            "CPU" :
+                    //                                            "GPU"));
+                }
+
+                virtual size_t size() const { return size_; }
+
+                virtual void *ptr() const {
+                    return static_cast<void *>(ptr_.get());
+                }
+
+                virtual std::type_index type() const { return type_; }
+
+                virtual void set_type(std::type_index type) { type_ = type; }
+
+                /*! the pointer of memory block. */
+                std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
+
+                /*! the size of memory block. */
+                size_t size_;
+
+                /* the current type of memory */
+                std::type_index type_;
+            };
+
+            /*! holds the memory block if allocated. */
+            std::shared_ptr<Placeholder> holder_;
+
+            /**
+             * @brief Holds the dimensions of the elements.
+             *
+             * @note dims_ does not indicate the memory block size.
+             */
+
+            DDim dims_;
+
+            /**
+             * @brief The layout of the memory block; the default is NHWC.
+             *
+             * @note The layout is the memory allocation order: it
+             *       describes how weight/data is stored.
+             *       For example, for a 4-D Tensor (rank=4) there are
+             *       three commonly used layouts. They are
+             *            NCHW, NHWC, CHWN.
+             *       N, C, H and W stand for, respectively, the batch
+             *       size, the number of feature maps, the height and
+             *       the width.
+             */
+
+            DataLayout layout_ = DataLayout::kNHWC;
+
+            /**
+             * @brief   A PlaceHolder may be shared by more than one tensor.
+             *
+             * @note    Some of them may be slices of the others. So the offset_
+             *          is introduced here to indicate the byte offset between
+             *          PlaceHolder::ptr_ and where the tensor data
+             *          really begins.
+             */
+            size_t offset_;
+        };
+
+        inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
+            Tensor res;
+            res.ShareDataWith(src);
+            res.Resize(flatten_to_2d(src.dims(), num_col_dims));
+            return res;
+        }
+
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/tensor_util.cc
+++ b/src/framework/tensor_util.cc
@@ -18,183 +18,189 @@
 #include <vector>

 namespace paddle_mobile {
-namespace framework {
-
-void TensorCopy(const Tensor &src, Tensor *dst) {
-  //  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to
-  //  "
-  //          << dst_place;
-  src.check_memory_size();
-
-  dst->Resize(src.dims());
-  dst->set_layout(src.layout());
-  auto src_ptr = src.data<void>();
-
-  auto dst_ptr = dst->mutable_data(src.type());
-
-  auto size = src.numel() * SizeOfType(src.type());
-
-  memory::Copy(dst_ptr, src_ptr, size);
-}
-
-void TensorCopySync(const Tensor &src, Tensor *dst) {
-  //  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
-  //          << " to " << dst_place;
-  src.check_memory_size();
-  dst->Resize(src.dims());
-  dst->set_layout(src.layout());
-  auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(src.type());
-  auto size = src.numel() * SizeOfType(src.type());
-  memory::Copy(dst_ptr, src_ptr, size);
-}
-
-template <typename Predicate> struct AnyDTypeVisitor {
-  Predicate predicate_;
-  const Tensor &tensor_;
-  Tensor *out_;
-
-  AnyDTypeVisitor(Predicate predicate, const Tensor &tensor, Tensor *out)
-      : predicate_(predicate), tensor_(tensor), out_(out) {}
-
-  template <typename T> void operator()() const {
-    //    auto t = EigenVector<T>::Flatten(tensor_);
-    //    auto o = EigenScalar<bool>::From(*out_);
-    // return any of predicate_(t) is true.
-    //    o.device(*ctx_.eigen_device()) = predicate_(t).any();
-  }
-};
-
-template <typename Predicate>
-inline void AnyImpl(Predicate predicate, const Tensor &tensor,
-                    framework::Tensor *out) {
-  VisitDataType(ToDataType(tensor.type()),
-                AnyDTypeVisitor<Predicate>(predicate, tensor, out));
-}
-
-template <typename Predicate> struct AnyVisitor {
-  const framework::Tensor &tensor_;
-  Predicate predicate_;
-
-  AnyVisitor(const framework::Tensor &tensor, Predicate predicate)
-      : tensor_(tensor), predicate_(std::move(predicate)) {}
-
-  bool operator()(void) const {
-    framework::Tensor out;
-    out.Resize({1});
-    out.mutable_data<bool>();
-    AnyImpl(predicate_, tensor_, &out);
-    return this->GetResult(out);
-  }
-
-  bool GetResult(const framework::Tensor &out) const {
-    return *out.data<bool>();
-  }
-};
-
-template <typename Predicate>
-inline bool Any(const framework::Tensor &tensor, Predicate predicate) {
-  AnyVisitor<Predicate> visitor(tensor, predicate);
-  //  return platform::VisitPlace(visitor);
-  return visitor();
-}
-
-struct ContainsNANPredicate {
-  template <typename T>
-  auto operator()(const T &eigen_vec) const
-      -> decltype(std::declval<T>().isnan()) {
-    // Cast eigen_vector to vector of bool. true if is inf.
-    return eigen_vec.isnan();
-  }
-};
-
-bool TensorContainsNAN(const framework::Tensor &tensor) {
-  ContainsNANPredicate predicate;
-  return Any(tensor, predicate);
-}
-
-struct ContainsInfPredicate {
-  template <typename T>
-  auto operator()(const T &eigen_vec) const
-      -> decltype(std::declval<T>().isinf()) {
-    // Cast eigen_vector to vector of bool. true if is inf.
-    return eigen_vec.isinf();
-  }
-};
-
-bool TensorContainsInf(const framework::Tensor &tensor) {
-  ContainsInfPredicate predicate;
-  return Any(tensor, predicate);
-}
-
-void TensorToStream(std::ostream &os, const Tensor &tensor) {
-  { // the 1st field, uint32_t version
-    constexpr uint32_t version = 0;
-    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
-  }
-  { // the 2nd field, tensor description
-    // int32_t  size
-    // void*    protobuf message
-    proto::VarType::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
-    auto dims = framework::vectorize(tensor.dims());
-    auto *pb_dims = desc.mutable_dims();
-    pb_dims->Resize(static_cast<int>(dims.size()), 0);
-    std::copy(dims.begin(), dims.end(), pb_dims->begin());
-    int32_t size = desc.ByteSize();
-    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-    auto out = desc.SerializeAsString();
-    os.write(out.data(), size);
-  }
-  { // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
-    auto *data_ptr = tensor.data<void>();
-    //    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-    //                   "Index overflow when writing tensor");
-
-    os.write(static_cast<const char *>(data_ptr),
-             static_cast<std::streamsize>(size));
-  }
-}
-
-struct DeserializedDataFunctor {
-  DeserializedDataFunctor(void **buf, Tensor *tensor)
-      : buf_(buf), tensor_(tensor) {}
-
-  template <typename T> void operator()() {
-    *buf_ = tensor_->mutable_data<T>();
-  }
-
-  void **buf_;
-  Tensor *tensor_;
-};
-
-void TensorFromStream(std::istream &is, framework::Tensor *tensor) {
-  uint32_t version;
-  is.read(reinterpret_cast<char *>(&version), sizeof(version));
-  //  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::VarType::TensorDesc desc;
-  { // int32_t size
-    // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char *>(buf.get()), size);
-    //    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-    //                   "Cannot parse tensor desc");
-  }
-  { // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-    void *buf;
-
-    framework::VisitDataType(desc.data_type(),
-                             DeserializedDataFunctor(&buf, tensor));
-    is.read(static_cast<char *>(buf), tensor->memory_size());
-  }
-}
-
-} // namespace framework
+    namespace framework {
+
+        /**
+         * @brief Copies the contents of src into dst.
+         *
+         * dst is resized to src's dims and given src's layout, then
+         * src.numel() * SizeOfType(src.type()) bytes are copied with
+         * memory::Copy.
+         *
+         * @param src tensor to read from; check_memory_size() is called on
+         *            it first.
+         * @param dst tensor to write to; its buffer is obtained through
+         *            mutable_data(src.type()).
+         */
+        void TensorCopy(const Tensor &src, Tensor *dst) {
+            src.check_memory_size();
+
+            // Make the destination agree with the source's shape and layout
+            // before asking it for a buffer.
+            dst->Resize(src.dims());
+            dst->set_layout(src.layout());
+
+            auto from = src.data<void>();
+            auto to = dst->mutable_data(src.type());
+            auto num_bytes = src.numel() * SizeOfType(src.type());
+
+            memory::Copy(to, from, num_bytes);
+        }
+
+        /**
+         * @brief Copies the contents of src into dst.
+         *
+         * Performs exactly the steps of TensorCopy (memory check, resize,
+         * layout, memory::Copy), so it simply forwards to it.
+         *
+         * @param src tensor to read from.
+         * @param dst tensor to write to.
+         */
+        void TensorCopySync(const Tensor &src, Tensor *dst) {
+            TensorCopy(src, dst);
+        }
+
+        /**
+         * @brief Visitor handed to VisitDataType by AnyImpl.
+         *
+         * Holds a predicate, the tensor to inspect and the tensor that is
+         * meant to receive the result.
+         *
+         * @note operator() currently has an empty body: the Eigen-based
+         *       implementation is commented out, so out_ is never written.
+         */
+        template <typename Predicate> struct AnyDTypeVisitor {
+            Predicate predicate_;
+            const Tensor &tensor_;
+            Tensor *out_;
+
+            AnyDTypeVisitor(Predicate predicate, const Tensor &tensor,
+                            Tensor *out)
+                : predicate_(predicate), tensor_(tensor), out_(out) {}
+
+            /*! Invoked with the element type T; does nothing at present. */
+            template <typename T> void operator()() const {
+                //    auto t = EigenVector<T>::Flatten(tensor_);
+                //    auto o = EigenScalar<bool>::From(*out_);
+                // return any of predicate_(t) is true.
+                //    o.device(*ctx_.eigen_device()) = predicate_(t).any();
+            }
+        };
+
+        /**
+         * @brief Runs an AnyDTypeVisitor over tensor through VisitDataType,
+         *        using the data type obtained from tensor.type().
+         *
+         * @param predicate predicate forwarded to the visitor.
+         * @param tensor    tensor to inspect.
+         * @param out       tensor handed to the visitor as its output.
+         */
+        template <typename Predicate>
+        inline void AnyImpl(Predicate predicate, const Tensor &tensor,
+                            framework::Tensor *out) {
+            const auto dtype = ToDataType(tensor.type());
+            VisitDataType(dtype,
+                          AnyDTypeVisitor<Predicate>(predicate, tensor, out));
+        }
+
+        /**
+         * @brief Callable that evaluates a predicate over a tensor and
+         *        returns the boolean result.
+         *
+         * A one-element bool tensor is allocated as the result slot, AnyImpl
+         * is run against it, and the slot is read back.
+         */
+        template <typename Predicate> struct AnyVisitor {
+            const framework::Tensor &tensor_;
+            Predicate predicate_;
+
+            AnyVisitor(const framework::Tensor &tensor, Predicate predicate)
+                : tensor_(tensor), predicate_(std::move(predicate)) {}
+
+            /*! @return the value left in the result slot after AnyImpl. */
+            bool operator()(void) const {
+                framework::Tensor out;
+                out.Resize({1});
+                // Start from a defined value: AnyDTypeVisitor::operator()
+                // currently writes nothing, so without this the returned
+                // bool would be read from uninitialized memory.
+                *out.mutable_data<bool>() = false;
+                AnyImpl(predicate_, tensor_, &out);
+                return this->GetResult(out);
+            }
+
+            /*! @return the first bool stored in out. */
+            bool GetResult(const framework::Tensor &out) const {
+                return *out.data<bool>();
+            }
+        };
+
+        /**
+         * @brief Builds an AnyVisitor for tensor and predicate and returns
+         *        the result of invoking it.
+         */
+        template <typename Predicate>
+        inline bool Any(const framework::Tensor &tensor, Predicate predicate) {
+            //  return platform::VisitPlace(visitor);
+            return AnyVisitor<Predicate>(tensor, predicate)();
+        }
+
+        /*! Predicate that forwards to the isnan() member of its argument. */
+        struct ContainsNANPredicate {
+            template <typename T>
+            auto operator()(const T &eigen_vec) const
+                -> decltype(std::declval<T>().isnan()) {
+                // Cast eigen_vector to vector of bool. true if is nan.
+                return eigen_vec.isnan();
+            }
+        };
+
+        /**
+         * @brief Evaluates Any() on tensor with a ContainsNANPredicate.
+         *
+         * @param tensor tensor to inspect.
+         * @return whatever Any() reports for the NaN predicate.
+         */
+        bool TensorContainsNAN(const framework::Tensor &tensor) {
+            return Any(tensor, ContainsNANPredicate());
+        }
+
+        /*! Predicate that forwards to the isinf() member of its argument. */
+        struct ContainsInfPredicate {
+            template <typename T>
+            auto operator()(const T &eigen_vec) const
+                -> decltype(std::declval<T>().isinf()) {
+                // Cast eigen_vector to vector of bool. true if is inf.
+                return eigen_vec.isinf();
+            }
+        };
+
+        /**
+         * @brief Evaluates Any() on tensor with a ContainsInfPredicate.
+         *
+         * @param tensor tensor to inspect.
+         * @return whatever Any() reports for the infinity predicate.
+         */
+        bool TensorContainsInf(const framework::Tensor &tensor) {
+            return Any(tensor, ContainsInfPredicate());
+        }
+
+        /**
+         * @brief Serializes tensor to os as three consecutive fields.
+         *
+         * The fields are written in this order:
+         *   1. a uint32_t version, always 0;
+         *   2. an int32_t byte count followed by a serialized
+         *      proto::VarType::TensorDesc holding the data type and dims;
+         *   3. tensor.memory_size() bytes of raw tensor data.
+         *
+         * All integers are written as their in-memory bytes; no byte-order
+         * conversion is performed. Stream errors are not checked.
+         *
+         * @param os     destination stream.
+         * @param tensor tensor to serialize.
+         */
+        void TensorToStream(std::ostream &os, const Tensor &tensor) {
+            { // the 1st field, uint32_t version
+                constexpr uint32_t version = 0;
+                os.write(reinterpret_cast<const char *>(&version),
+                         sizeof(version));
+            }
+            { // the 2nd field, tensor description
+                // int32_t  size
+                // void*    protobuf message
+                proto::VarType::TensorDesc desc;
+                desc.set_data_type(framework::ToDataType(tensor.type()));
+                auto dims = framework::vectorize(tensor.dims());
+                auto *pb_dims = desc.mutable_dims();
+                // Size the repeated field first so std::copy can write
+                // straight into it.
+                pb_dims->Resize(static_cast<int>(dims.size()), 0);
+                std::copy(dims.begin(), dims.end(), pb_dims->begin());
+                int32_t size = desc.ByteSize();
+                os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+                auto out = desc.SerializeAsString();
+                os.write(out.data(), size);
+            }
+            { // the 3rd field, tensor data
+                uint64_t size = tensor.memory_size();
+                auto *data_ptr = tensor.data<void>();
+                //    PADDLE_ENFORCE(size <
+                //    std::numeric_limits<std::streamsize>::max(),
+                //                   "Index overflow when writing tensor");
+
+                os.write(static_cast<const char *>(data_ptr),
+                         static_cast<std::streamsize>(size));
+            }
+        }
+
+        /**
+         * @brief Visitor used by TensorFromStream to obtain the tensor's
+         *        writable buffer for a given element type.
+         *
+         * When invoked with element type T it stores
+         * tensor_->mutable_data<T>() into *buf_.
+         */
+        struct DeserializedDataFunctor {
+            DeserializedDataFunctor(void **buf, Tensor *tensor)
+                : buf_(buf), tensor_(tensor) {}
+
+            template <typename T> void operator()() {
+                *buf_ = tensor_->mutable_data<T>();
+            }
+
+            /*! where the buffer pointer is written. */
+            void **buf_;
+            /*! the tensor whose buffer is requested. */
+            Tensor *tensor_;
+        };
+
+        /**
+         * @brief Reads a tensor from is in the layout written by
+         *        TensorToStream.
+         *
+         * Fields are consumed in this order: a uint32_t version, an int32_t
+         * byte count, that many bytes of serialized
+         * proto::VarType::TensorDesc, and finally tensor->memory_size()
+         * bytes of raw data.
+         *
+         * If the description cannot be read or parsed the function returns
+         * early and leaves tensor untouched.
+         *
+         * @param is     source stream.
+         * @param tensor tensor to resize and fill.
+         */
+        void TensorFromStream(std::istream &is, framework::Tensor *tensor) {
+            uint32_t version = 0;
+            is.read(reinterpret_cast<char *>(&version), sizeof(version));
+            //  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+            proto::VarType::TensorDesc desc;
+            { // int32_t size
+                // proto buffer
+                int32_t size = 0;
+                is.read(reinterpret_cast<char *>(&size), sizeof(size));
+                // A failed read or a negative length cannot describe a
+                // valid message; bail out before allocating.
+                if (!is || size < 0) {
+                    return;
+                }
+                std::unique_ptr<char[]> buf(new char[size]);
+                is.read(buf.get(), size);
+                // The parse used to live inside a PADDLE_ENFORCE that was
+                // commented out, which left desc empty. Parse explicitly.
+                if (!is || !desc.ParseFromArray(buf.get(), size)) {
+                    return;
+                }
+            }
+            { // read tensor
+                std::vector<int64_t> dims;
+                dims.reserve(static_cast<size_t>(desc.dims().size()));
+                std::copy(desc.dims().begin(), desc.dims().end(),
+                          std::back_inserter(dims));
+                tensor->Resize(framework::make_ddim(dims));
+                void *buf = nullptr;
+
+                framework::VisitDataType(desc.data_type(),
+                                         DeserializedDataFunctor(&buf, tensor));
+                // buf stays null if the visitor was never invoked for this
+                // data type; do not read through it in that case.
+                if (buf != nullptr) {
+                    is.read(static_cast<char *>(buf), tensor->memory_size());
+                }
+            }
+        }
+
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/tensor_util.h
+++ b/src/framework/tensor_util.h
@@ -20,47 +20,47 @@ limitations under the License. */
 #include <vector>

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-void TensorCopy(const Tensor &src, Tensor *dst);
-void TensorCopySync(const Tensor &src, Tensor *dst);
+        void TensorCopy(const Tensor &src, Tensor *dst);
+        void TensorCopySync(const Tensor &src, Tensor *dst);

-template <typename T>
-void TensorFromVector(const std::vector<T> &src, Tensor *dst);
+        template <typename T>
+        void TensorFromVector(const std::vector<T> &src, Tensor *dst);

-template <typename T>
-void TesnorToVector(const Tensor &src, std::vector<T> *dst);
+        /*! Copies the elements of src into *dst; defined further below. */
+        template <typename T>
+        void TensorToVector(const Tensor &src, std::vector<T> *dst);

-bool TensorContainsNAN(const framework::Tensor &tensor);
-bool TensorContainsInf(const framework::Tensor &tensor);
+        bool TensorContainsNAN(const framework::Tensor &tensor);
+        bool TensorContainsInf(const framework::Tensor &tensor);

-void TensorToStream(std::ostream &os, const Tensor &tensor);
-void TensorFromStream(std::istream &is, Tensor *tensor);
+        void TensorToStream(std::ostream &os, const Tensor &tensor);
+        void TensorFromStream(std::istream &is, Tensor *tensor);

-//
-// The implementation of template functions.
-//
+        //
+        // The implementation of template functions.
+        //

-template <typename T>
-void TensorFromVector(const std::vector<T> &src, Tensor *dst) {
-  auto src_ptr = static_cast<const void *>(src.data());
-  dst->Resize({static_cast<int64_t>(src.size())});
-  auto dst_ptr = static_cast<void *>(dst->mutable_data<T>());
-  auto size = src.size() * sizeof(T);
+        /**
+         * @brief Fills dst from a std::vector.
+         *
+         * dst is resized to a one-dimensional shape of src.size() elements
+         * and src.size() * sizeof(T) bytes are copied into the buffer
+         * returned by dst->mutable_data<T>().
+         *
+         * @param src vector to read from.
+         * @param dst tensor to resize and write to.
+         */
+        template <typename T>
+        void TensorFromVector(const std::vector<T> &src, Tensor *dst) {
+            auto src_ptr = static_cast<const void *>(src.data());
+            dst->Resize({static_cast<int64_t>(src.size())});
+            auto dst_ptr = static_cast<void *>(dst->mutable_data<T>());
+            auto size = src.size() * sizeof(T);

-  memory::Copy(dst_ptr, src_ptr, size);
-}
+            memory::Copy(dst_ptr, src_ptr, size);
+        }

-template <typename T>
-void TensorToVector(const Tensor &src, std::vector<T> *dst) {
-  auto src_ptr = static_cast<const void *>(src.data<T>());
-  auto size = src.numel() * sizeof(T);
+        /**
+         * @brief Copies the contents of a tensor into a std::vector.
+         *
+         * dst is resized to src.numel() elements and
+         * src.numel() * sizeof(T) bytes are copied from src.data<T>().
+         *
+         * @param src tensor to read from.
+         * @param dst vector to resize and write to.
+         */
+        template <typename T>
+        void TensorToVector(const Tensor &src, std::vector<T> *dst) {
+            auto src_ptr = static_cast<const void *>(src.data<T>());
+            auto size = src.numel() * sizeof(T);

-  dst->resize(src.numel());
-  auto dst_ptr = static_cast<void *>(dst->data());
+            dst->resize(src.numel());
+            auto dst_ptr = static_cast<void *>(dst->data());

-  memory::Copy(dst_ptr, src_ptr, size);
-}
+            memory::Copy(dst_ptr, src_ptr, size);
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/var_desc.cpp
+++ b/src/framework/var_desc.cpp
@@ -20,9 +20,9 @@ SOFTWARE.

 namespace paddle_mobile {

-namespace framework {
+    namespace framework {

-VarDesc::VarDesc(const proto::VarDesc &desc) : desc_(desc) {}
+        /*! Initializes desc_ from the given protobuf description. */
+        VarDesc::VarDesc(const proto::VarDesc &desc) : desc_(desc) {}

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/var_desc.h
+++ b/src/framework/var_desc.h
@@ -22,67 +22,68 @@ SOFTWARE.
 #include "paddle_mobile_object.h"

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-class VarDesc {
-public:
-  VarDesc(const proto::VarDesc &desc);
+        class VarDesc {
+          public:
+            VarDesc(const proto::VarDesc &desc);

-  std::string Name() const { return desc_.name(); }
+            /*! @return the name stored in the wrapped description. */
+            std::string Name() const { return desc_.name(); }

-  proto::VarType::Type GetType() const { return desc_.type().type(); }
+            /*! @return the variable type recorded in desc_.type(). */
+            proto::VarType::Type GetType() const { return desc_.type().type(); }

-  bool Persistable() const { return desc_.persistable(); }
+            /*! @return the persistable flag of the wrapped description. */
+            bool Persistable() const { return desc_.persistable(); }

-  const proto::VarType::ChannelDesc &channel_desc() const {
-    switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return desc_.type().channel();
-    default:
-      break;
-    }
-  }
+            /**
+             * @brief Returns the channel description of this variable.
+             *
+             * @return desc_.type().channel() when the variable type is
+             *         CHANNEL; otherwise the default ChannelDesc instance,
+             *         so that a valid reference is always returned.
+             */
+            const proto::VarType::ChannelDesc &channel_desc() const {
+                switch (desc_.type().type()) {
+                case proto::VarType::CHANNEL:
+                    return desc_.type().channel();
+                default:
+                    // Previously control fell off the end of the function
+                    // here, which is undefined behaviour for a function
+                    // returning a reference.
+                    return proto::VarType::ChannelDesc::default_instance();
+                }
+            }

-  const proto::VarType::TensorDesc &tensor_desc() const {
-    switch (desc_.type().type()) {
-    case proto::VarType::SELECTED_ROWS:
-      return desc_.type().selected_rows();
-    case proto::VarType::LOD_TENSOR:
-      return desc_.type().lod_tensor().tensor();
-    case proto::VarType::LOD_TENSOR_ARRAY:
-      return desc_.type().tensor_array().tensor();
-    default:
-      break;
-    }
-  }
+            /**
+             * @brief Returns the tensor description of this variable.
+             *
+             * @return the TensorDesc matching the variable type
+             *         (SELECTED_ROWS, LOD_TENSOR or LOD_TENSOR_ARRAY);
+             *         for any other type the default TensorDesc instance,
+             *         so that a valid reference is always returned.
+             */
+            const proto::VarType::TensorDesc &tensor_desc() const {
+                switch (desc_.type().type()) {
+                case proto::VarType::SELECTED_ROWS:
+                    return desc_.type().selected_rows();
+                case proto::VarType::LOD_TENSOR:
+                    return desc_.type().lod_tensor().tensor();
+                case proto::VarType::LOD_TENSOR_ARRAY:
+                    return desc_.type().tensor_array().tensor();
+                default:
+                    // Previously control fell off the end of the function
+                    // here, which is undefined behaviour for a function
+                    // returning a reference.
+                    return proto::VarType::TensorDesc::default_instance();
+                }
+            }

-  proto::VarType::Type GetDataType() const {
-    switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return channel_desc().data_type();
-      break;
-    default:
-      return tensor_desc().data_type();
-    }
-  }
+            /**
+             * @brief Returns the element data type of this variable.
+             *
+             * @return channel_desc().data_type() for CHANNEL variables,
+             *         tensor_desc().data_type() for everything else.
+             */
+            proto::VarType::Type GetDataType() const {
+                if (desc_.type().type() == proto::VarType::CHANNEL) {
+                    return channel_desc().data_type();
+                }
+                return tensor_desc().data_type();
+            }

-  template <typename T>
-  std::vector<T> RepeatedToVector(
-      const google::protobuf::RepeatedField<T> &repeated_field) const {
-    std::vector<T> ret;
-    ret.reserve(repeated_field.size());
-    std::copy(repeated_field.begin(), repeated_field.end(),
-              std::back_inserter(ret));
-    return ret;
-  }
+            /**
+             * @brief Copies a protobuf repeated field into a std::vector.
+             *
+             * @param repeated_field field to copy from.
+             * @return a vector holding the field's elements in order.
+             */
+            template <typename T>
+            std::vector<T> RepeatedToVector(
+                const google::protobuf::RepeatedField<T> &repeated_field)
+                const {
+                return std::vector<T>(repeated_field.begin(),
+                                      repeated_field.end());
+            }

-  std::vector<int64_t> GetShape() const {
-    return this->RepeatedToVector(tensor_desc().dims());
-  }
+            /*! @return the dims of tensor_desc() copied into a vector. */
+            std::vector<int64_t> GetShape() const {
+                return this->RepeatedToVector(tensor_desc().dims());
+            }

-private:
-  proto::VarDesc desc_;
-};
+          private:
+            proto::VarDesc desc_;
+        };

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/var_type.h
+++ b/src/framework/var_type.h
@@ -23,16 +23,17 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-namespace framework {
-inline proto::VarType::Type ToVarType(std::type_index type) {
-  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
-    return proto::VarType_Type_LOD_TENSOR;
-  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
-    return proto::VarType_Type_SELECTED_ROWS;
-  } else {
-    //    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
-  }
-}
+    namespace framework {
+        /**
+         * @brief Maps a C++ type to the matching proto::VarType::Type.
+         *
+         * @param type type index to look up.
+         * @return VarType_Type_LOD_TENSOR for LoDTensor,
+         *         VarType_Type_SELECTED_ROWS for SelectedRows.
+         */
+        inline proto::VarType::Type ToVarType(std::type_index type) {
+            // Compare the type_index values themselves: hash_code() is
+            // allowed to be equal for different types, so comparing hashes
+            // could misidentify a type.
+            if (type == std::type_index(typeid(LoDTensor))) {
+                return proto::VarType_Type_LOD_TENSOR;
+            } else if (type == std::type_index(typeid(SelectedRows))) {
+                return proto::VarType_Type_SELECTED_ROWS;
+            } else {
+                //    PADDLE_THROW("ToVarType:Unsupported type %s",
+                //    type.name());
+                // NOTE(review): with the throw disabled this branch falls
+                // off the end without returning a value; callers must only
+                // pass the two supported types until an error path exists.
+            }
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/variable.h
+++ b/src/framework/variable.h
@@ -26,72 +26,71 @@ SOFTWARE.
 #include <typeinfo>

 namespace paddle_mobile {
-namespace framework {
-class Variable : public PaddleMobileObject {
-public:
-  Variable() {}
-  ~Variable() {}
-
-  template <typename T> const T *Get() const {
-    return static_cast<const T *>(holder_->Ptr());
-  }
-
-  bool IsInitialized() const { return holder_ != nullptr; }
-
-  const std::string *Name() { return name_; }
-
-  template <typename T> T *GetMutable() {
-    if (!IsType<T>()) {
-      if (*Name() == "pixel") {
-        //        std::cout << " reset " << *Name() << std::endl;
-      }
-      holder_.reset(new PlaceholderImp<T>(new T()));
-    }
-    return static_cast<T *>(holder_->Ptr());
-  }
-
-  template <typename T> bool IsType() const {
-    if (holder_) {
-      //                printf("not null \n");
-      printf(" holder type : %s, this type %s \n", holder_->Type().name(),
-             typeid(T).name());
-    }
-
-    //              std::cout << " " << holder_->Type() << " " <<  typeid(T) <<
-    //              std::endl;
-    return holder_ != nullptr && holder_->Type() == typeid(T);
-  }
-
-  void Clear() { holder_.reset(); }
-
-  std::type_index Type() const { return holder_->Type(); }
-
-  void SetName(const std::string *name) { name_ = name; }
-
-private:
-  struct Placeholder {
-    Placeholder() = default;
-    virtual ~Placeholder() = default;
-
-    virtual const std::type_info &Type() const = 0;
-    virtual void *Ptr() const = 0;
-  };
-
-  template <typename T> struct PlaceholderImp : public Placeholder {
-    explicit PlaceholderImp(T *ptr) : ptr_(ptr), type_(typeid(T)) {}
-
-    virtual const std::type_info &Type() const { return type_; }
-    virtual void *Ptr() const override {
-      return static_cast<void *>(ptr_.get());
-    }
-
-    std::unique_ptr<T> ptr_;
-    const std::type_info &type_;
-  };
-
-  std::unique_ptr<Placeholder> holder_;
-  friend class Scope;
-  const std::string *name_;
-};
-} // namespace framework
+    namespace framework {
+        /**
+         * @brief Type-erased holder for a single heap-allocated value.
+         *
+         * The value is owned through holder_; its dynamic type is recorded
+         * so that IsType<T>() can check it and GetMutable<T>() can replace
+         * it when a different type is requested.
+         */
+        class Variable : public PaddleMobileObject {
+          public:
+            /**
+             * @brief Returns the held value as a const T*.
+             *
+             * @note holder_ is dereferenced without a null check and the
+             *       type is not verified; call IsType<T>() first.
+             */
+            template <typename T> const T *Get() const {
+                return static_cast<const T *>(holder_->Ptr());
+            }
+
+            /*! @return true once a value has been created. */
+            bool IsInitialized() const { return holder_ != nullptr; }
+
+            /*! @return the pointer given to SetName, or nullptr if unset. */
+            const std::string *Name() { return name_; }
+
+            /**
+             * @brief Returns the held value as a T*, default-constructing a
+             *        new T first if no value of type T is currently held.
+             */
+            template <typename T> T *GetMutable() {
+                if (!IsType<T>()) {
+                    // A debug branch that dereferenced Name() used to sit
+                    // here; its body was empty and name_ may be unset, so
+                    // it has been dropped.
+                    holder_.reset(new PlaceholderImp<T>(new T()));
+                }
+                return static_cast<T *>(holder_->Ptr());
+            }
+
+            /**
+             * @brief Reports whether a value of exactly type T is held.
+             *
+             * @note NOTE(review): prints the held and requested type names
+             *       with printf on every call where a value exists; this
+             *       looks like leftover debugging output.
+             */
+            template <typename T> bool IsType() const {
+                if (holder_) {
+                    //                printf("not null \n");
+                    printf(" holder type : %s, this type %s \n",
+                           holder_->Type().name(), typeid(T).name());
+                }
+
+                //              std::cout << " " << holder_->Type() << " " <<
+                //              typeid(T) <<
+                //              std::endl;
+                return holder_ != nullptr && holder_->Type() == typeid(T);
+            }
+
+            /*! Releases the held value, if any. */
+            void Clear() { holder_.reset(); }
+
+            /*! @return the type of the held value; requires a held value. */
+            std::type_index Type() const { return holder_->Type(); }
+
+            /*! Stores the pointer only; the string itself is not copied. */
+            void SetName(const std::string *name) { name_ = name; }
+
+          private:
+            /*! Interface through which the held value is accessed. */
+            struct Placeholder {
+                Placeholder() = default;
+                virtual ~Placeholder() = default;
+
+                virtual const std::type_info &Type() const = 0;
+                virtual void *Ptr() const = 0;
+            };
+
+            /*! Placeholder that owns a T and remembers typeid(T). */
+            template <typename T> struct PlaceholderImp : public Placeholder {
+                explicit PlaceholderImp(T *ptr) : ptr_(ptr), type_(typeid(T)) {}
+
+                virtual const std::type_info &Type() const override {
+                    return type_;
+                }
+                virtual void *Ptr() const override {
+                    return static_cast<void *>(ptr_.get());
+                }
+
+                std::unique_ptr<T> ptr_;
+                const std::type_info &type_;
+            };
+
+            std::unique_ptr<Placeholder> holder_;
+            friend class Scope;
+            // Initialized so Name() is well defined before SetName() runs.
+            const std::string *name_ = nullptr;
+        };
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -29,359 +29,405 @@ SOFTWARE.

 namespace paddle_mobile {

-void ReadBinaryFile(const std::string &filename, std::string *contents) {
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  fin.seekg(0, std::ios::end);
-  contents->clear();
-  contents->resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(contents->at(0)), contents->size());
-  fin.close();
-}
-
-template <typename Dtype, Precision P>
-void Loader<Dtype, P>::LoadVar(framework::LoDTensor *tensor,
-                               const std::string &file_path) {
-
-  LOG(kLOG_DEBUG) << "  to load " << file_path;
-  //  Log(kLOG_DEBUG) << "123";
-
-  std::ifstream is(file_path);
-
-  std::streampos pos = is.tellg(); //   save   current   position
-  is.seekg(0, std::ios::end);
-  LOG(kLOG_DEBUG) << "  file length = " << is.tellg();
-  is.seekg(pos); //   restore   saved   position
-
-  // 1. version
-  uint32_t version;
-  is.read(reinterpret_cast<char *>(&version), sizeof(version));
-  LOG(kLOG_INFO) << "   version: " << version;
-
-  // 2 Lod information
-  uint64_t lod_level;
-  is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-  LOG(kLOG_DEBUG) << "   load level: " << lod_level;
-  LOG(kLOG_DEBUG) << "   lod info: ";
-  auto &lod = *tensor->mutable_lod();
-  lod.resize(lod_level);
-  for (uint64_t i = 0; i < lod_level; ++i) {
-    uint64_t size;
-    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-    std::vector<size_t> tmp(size / sizeof(size_t));
-    is.read(reinterpret_cast<char *>(tmp.data()),
-            static_cast<std::streamsize>(size));
-    for (int j = 0; j < tmp.size(); ++j) {
-      LOG(kLOG_DEBUG1) << "    lod - " << tmp[j];
+    // Reads all of `filename` into *contents ("" if unreadable or empty).
+    void ReadBinaryFile(const std::string &filename, std::string *contents) {
+        std::ifstream fin(filename, std::ios::in | std::ios::binary);
+        contents->clear();
+        const std::streamoff len = fin.seekg(0, std::ios::end).tellg();
+        if (len <= 0) return; // tellg() yields -1 on failure, 0 when empty
+        contents->resize(static_cast<size_t>(len));
+        fin.seekg(0, std::ios::beg).read(&(*contents)[0], len);
    }
-    lod[i] = tmp;
-  }
-
-  // 3. tensor version
-  uint32_t tensor_version;
-  is.read(reinterpret_cast<char *>(&tensor_version), sizeof(tensor_version));
-  //  std::cout << "   tensor_version: " << tensor_version << std::endl;
-
-  // 4. tensor desc
-  int32_t size;
-  is.read(reinterpret_cast<char *>(&size), sizeof(size));
-  //  std::cout << "   tensor desc size: " << size << std::endl;
-  std::unique_ptr<char[]> buf(new char[size]);
-  is.read(reinterpret_cast<char *>(buf.get()), size);
-
-  framework::proto::VarType::TensorDesc desc;
-  desc.ParseFromArray(buf.get(), size);
-
-  //  std::cout << "   desc dims size " << desc.dims().size() << std::endl;
-  int memory_size = 1;
-  for (int l = 0; l < desc.dims().size(); ++l) {
-    //    std::cout << "    dim " << l << " value: " << desc.dims()[l] <<
-    //    std::endl;
-    memory_size *= desc.dims()[l];
-  }
-
-  std::vector<int64_t> dims;
-  dims.reserve(static_cast<size_t>(desc.dims().size()));
-  std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-  tensor->Resize(framework::make_ddim(dims));
-
-  void *memory;
-  int type_size = 0;
-  //  std::cout << "    desc pre type: ";
-  switch (desc.data_type()) {
-  case framework::proto::VarType::FP16:
-    //      std::cout << "FP16" << std::endl;
-    type_size = 2;
-    break;
-  case framework::proto::VarType::FP32:
-    type_size = 4;
-    memory = tensor->mutable_data<float>();
-    //      std::cout << "FP32" << std::endl;
-    break;
-  case framework::proto::VarType::FP64:
-    type_size = 8;
-    //      std::cout << "FP64" << std::endl;
-    break;
-  case framework::proto::VarType::INT32:
-    type_size = 4;
-    //      std::cout << "INT32" << std::endl;
-    break;
-  case framework::proto::VarType::INT64:
-    type_size = 8;
-    //      std::cout << "INT64" << std::endl;
-    break;
-  case framework::proto::VarType::BOOL:
-    type_size = 1;
-    //      std::cout << "BOOL" << std::endl;
-    break;
-  default:
-    break;
-    //      std::cout << "    not support" << std::endl;
-  }
-
-  //  std::cout << "    malloc size: " << memory_size * type_size << std::endl;
-  is.read(static_cast<char *>(memory), memory_size * type_size);
-  //  std::cout << "    memory: " << memory << std::endl;
-  is.close();
-};
-
-template <typename Dtype, Precision P>
-const framework::Program<Dtype, P>
-Loader<Dtype, P>::Load(const std::string &dirname) {
-  std::string model_filename = dirname + "/__model__";
-  std::string program_desc_str;
-  ReadBinaryFile(model_filename, &program_desc_str);
-  framework::proto::ProgramDesc program_desc_proto;
-  program_desc_proto.ParseFromString(program_desc_str);
-
-  std::shared_ptr<framework::ProgramDesc> originProgramDesc =
-      std::make_shared<framework::ProgramDesc>(program_desc_proto);
-
-  framework::Program<Dtype, P> program;
-  program.originProgram = originProgramDesc;
-
-  std::shared_ptr<framework::Scope> scope =
-      std::make_shared<framework::Scope>();
-  program.scope = scope;
-
-  auto block = originProgramDesc->Block(0);
-
-  for (auto block : originProgramDesc->Blocks()) {
-    //    std::cout << "for block" << std::endl;
-    for (int i = 0; i < block->Vars().size(); ++i) {
-      std::shared_ptr<framework::VarDesc> var_desc = block->Vars()[i];
-      auto var = scope->Var(var_desc->Name());
-      if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-        if (var_desc->Persistable() &&
-            var_desc->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-            var_desc->GetType() != framework::proto::VarType::FETCH_LIST) {
-          framework::LoDTensor *tensor =
-              var->GetMutable<framework::LoDTensor>();
-          // to load
-          LoadVar(tensor, dirname + "/" + var_desc->Name());
-        }
-      } else {
-        //        std::cout << "非 lod" << std::endl;
-      }
-    }
-  }

-#ifdef PADDLE_MOBILE_DEBUG
-  for (int i = 0; i < program_desc_proto.blocks().size(); ++i) {
-    framework::proto::BlockDesc block = program_desc_proto.blocks()[i];
-    //    std::cout << "block: " << block.idx() << std::endl;
-    for (int j = 0; j < block.ops().size(); ++j) {
-      framework::proto::OpDesc op = block.ops()[j];
-
-      //      std::cout << " op: " << op.type() << std::endl;
-      for (int m = 0; m < op.inputs_size(); ++m) {
-        const framework::proto::OpDesc::Var &var = op.inputs(m);
-        //        std::cout << "  input parameter: " << var.parameter() <<
-        //        std::endl;
-        for (int n = 0; n < var.arguments().size(); ++n) {
-          //          std::cout << "   argument - " << var.arguments()[n] <<
-          //          std::endl;
-        }
-      }
-
-      for (int y = 0; y < op.outputs_size(); ++y) {
-        const framework::proto::OpDesc::Var &var = op.outputs(y);
-        //        std::cout << "  output parameter: " << var.parameter() <<
-        //        std::endl;
-        for (int z = 0; z < var.arguments().size(); ++z) {
-          //          std::cout << "   argument - " << var.arguments()[z] <<
-          //          std::endl;
-        }
-      }
-
-      for (int x = 0; x < op.attrs().size(); ++x) {
-        const framework::proto::OpDesc_Attr attr = op.attrs()[x];
-        //        std::cout << "  attr name: " << attr.name() << std::endl;
-        //        std::cout << "  attr type: " << attr.type() << std::endl;
-
-        switch (attr.type()) {
-        case framework::proto::AttrType::BOOLEAN:
-          //            std::cout << "   boolen: " << attr.b() << std::endl;
-          break;
-        case framework::proto::AttrType::INT:
-          //            std::cout << "   int: " << attr.i() << std::endl;
-          break;
-        case framework::proto::AttrType::FLOAT:
-        //            std::cout << "   float: " << attr.f() << std::endl;
-        case framework::proto::AttrType::STRING:
-        //            std::cout << "   string: " << attr.s() << std::endl;
-        case framework::proto::AttrType::BOOLEANS:
-          //                            std::vector<bool>
-          //                            bools(attr.bools_size());
-          for (int y = 0; y < attr.bools_size(); ++y) {
-            //              std::cout << "   bool - " << attr.bools(y) <<
-            //              std::endl;
-          }
-        case framework::proto::AttrType::LONG:
-        //            std::cout << "   long: " << attr.l() << std::endl;
-        case framework::proto::AttrType::FLOATS:
-          for (int y = 0; y < attr.floats_size(); ++y) {
-            //              std::cout << "   float - " << y << ": " <<
-            //              attr.floats(y)
-            //                        << std::endl;
-          }
-        case framework::proto::AttrType::INTS:
-          for (int y = 0; y < attr.ints_size(); ++y) {
-            //              std::cout << "   int - " << y << ": " <<
-            //              attr.ints(y)
-            //                        << std::endl;
-          }
-        case framework::proto::AttrType::STRINGS:
-          for (int y = 0; y < attr.strings_size(); ++y) {
-            //              std::cout << "   string - " << y << ": " <<
-            //              attr.strings(y)
-            //                        << std::endl;
-          }
-        }
-      }
-    }
+    template <typename Dtype, Precision P>
+    void Loader<Dtype, P>::LoadVar(framework::LoDTensor *tensor,
+                                   const std::string &file_path) {

-    for (int k = 0; k < block.vars().size(); ++k) {
-      framework::proto::VarDesc var = block.vars()[k];
-      if (var.type().type() == framework::proto::VarType::LOD_TENSOR) {
-        //        std::cout << " var name: " << var.name() << std::endl;
-        const framework::proto::VarType::TensorDesc &tensor_desc =
-            var.type().lod_tensor().tensor();
-        //        std::cout << "  in var tensor desc dims size "
-        //                  << tensor_desc.dims().size() << std::endl;
-        int memory_size = 1;
-        for (int l = 0; l < tensor_desc.dims().size(); ++l) {
-          //          std::cout << " var tensor desc dim " << l
-          //                    << " value: " << tensor_desc.dims()[l] <<
-          //                    std::endl;
-        }
-      }
+        LOG(kLOG_DEBUG) << "  to load " << file_path;
+        //  Log(kLOG_DEBUG) << "123";

-      if (var.persistable() &&
-          var.type().type() != framework::proto::VarType::FEED_MINIBATCH &&
-          var.type().type() != framework::proto::VarType::FETCH_LIST) {
-        //        std::cout << "  to load " << var.name() << std::endl;
-        std::string file_path = dirname + "/" + var.name();
        std::ifstream is(file_path);
+
        std::streampos pos = is.tellg(); //   save   current   position
        is.seekg(0, std::ios::end);
-        //        std::cout << "  file length = " << is.tellg() << std::endl;
+        LOG(kLOG_DEBUG) << "  file length = " << is.tellg();
        is.seekg(pos); //   restore   saved   position

        // 1. version
        uint32_t version;
        is.read(reinterpret_cast<char *>(&version), sizeof(version));
-        //        std::cout << "   version: " << version << std::endl;
+        LOG(kLOG_INFO) << "   version: " << version;

        // 2 Lod information
        uint64_t lod_level;
        is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-        //        std::cout << "   load level: " << lod_level << std::endl;
-        //        std::cout << "   lod info: " << std::endl;
+        LOG(kLOG_DEBUG) << "   load level: " << lod_level;
+        LOG(kLOG_DEBUG) << "   lod info: ";
+        auto &lod = *tensor->mutable_lod();
+        lod.resize(lod_level);
        for (uint64_t i = 0; i < lod_level; ++i) {
-          uint64_t size;
-          is.read(reinterpret_cast<char *>(&size), sizeof(size));
-          std::vector<size_t> tmp(size / sizeof(size_t));
-          is.read(reinterpret_cast<char *>(tmp.data()),
-                  static_cast<std::streamsize>(size));
-          for (int j = 0; j < tmp.size(); ++j) {
-            //            std::cout << "    lod - " << tmp[j] << std::endl;
-          }
+            uint64_t size;
+            is.read(reinterpret_cast<char *>(&size), sizeof(size));
+            std::vector<size_t> tmp(size / sizeof(size_t));
+            is.read(reinterpret_cast<char *>(tmp.data()),
+                    static_cast<std::streamsize>(size));
+            for (int j = 0; j < tmp.size(); ++j) {
+                LOG(kLOG_DEBUG1) << "    lod - " << tmp[j];
+            }
+            lod[i] = tmp;
        }

+        // 3. tensor version
        uint32_t tensor_version;
-        is.read(reinterpret_cast<char *>(&version), sizeof(version));
-        //        std::cout << "   tensor_version: " << tensor_version <<
-        //        std::endl;
+        is.read(reinterpret_cast<char *>(&tensor_version),
+                sizeof(tensor_version));
+        //  std::cout << "   tensor_version: " << tensor_version << std::endl;

+        // 4. tensor desc
        int32_t size;
        is.read(reinterpret_cast<char *>(&size), sizeof(size));
-        //        std::cout << "   tensor desc size: " << size << std::endl;
+        //  std::cout << "   tensor desc size: " << size << std::endl;
        std::unique_ptr<char[]> buf(new char[size]);
        is.read(reinterpret_cast<char *>(buf.get()), size);

        framework::proto::VarType::TensorDesc desc;
        desc.ParseFromArray(buf.get(), size);

-        //        std::cout << "   desc dims size " << desc.dims().size() <<
-        //        std::endl;
+        //  std::cout << "   desc dims size " << desc.dims().size() <<
+        //  std::endl;
        int memory_size = 1;
        for (int l = 0; l < desc.dims().size(); ++l) {
-          //          std::cout << "    dim " << l << " value: " <<
-          //          desc.dims()[l]
-          //                    << std::endl;
-          memory_size *= desc.dims()[l];
+            //    std::cout << "    dim " << l << " value: " << desc.dims()[l]
+            //    <<
+            //    std::endl;
+            memory_size *= desc.dims()[l];
        }

+        std::vector<int64_t> dims;
+        dims.reserve(static_cast<size_t>(desc.dims().size()));
+        std::copy(desc.dims().begin(), desc.dims().end(),
+                  std::back_inserter(dims));
+        tensor->Resize(framework::make_ddim(dims));
+
+        void *memory;
        int type_size = 0;
-        //        std::cout << "    desc pre type: ";
+        //  std::cout << "    desc pre type: ";
        switch (desc.data_type()) {
        case framework::proto::VarType::FP16:
-          //            std::cout << "FP16" << std::endl;
-          type_size = 2;
-          break;
+            //      std::cout << "FP16" << std::endl;
+            type_size = 2;
+            break;
        case framework::proto::VarType::FP32:
-          type_size = 4;
-          //            std::cout << "FP32" << std::endl;
-          break;
+            type_size = 4;
+            memory = tensor->mutable_data<float>();
+            //      std::cout << "FP32" << std::endl;
+            break;
        case framework::proto::VarType::FP64:
-          type_size = 8;
-          //            std::cout << "FP64" << std::endl;
-          break;
+            type_size = 8;
+            //      std::cout << "FP64" << std::endl;
+            break;
        case framework::proto::VarType::INT32:
-          type_size = 4;
-          //            std::cout << "INT32" << std::endl;
-          break;
+            type_size = 4;
+            //      std::cout << "INT32" << std::endl;
+            break;
        case framework::proto::VarType::INT64:
-          type_size = 8;
-          //            std::cout << "INT64" << std::endl;
-          break;
+            type_size = 8;
+            //      std::cout << "INT64" << std::endl;
+            break;
        case framework::proto::VarType::BOOL:
-          type_size = 1;
-          //            std::cout << "BOOL" << std::endl;
-          break;
+            type_size = 1;
+            //      std::cout << "BOOL" << std::endl;
+            break;
        default:
-          break;
-          //            std::cout << "    not support" << std::endl;
+            break;
+            //      std::cout << "    not support" << std::endl;
        }

-        //        std::cout << "    malloc size: " << memory_size * type_size
-        //                  << std::endl;
-        void *memory = malloc(memory_size * type_size);
+        //  std::cout << "    malloc size: " << memory_size * type_size <<
+        //  std::endl;
        is.read(static_cast<char *>(memory), memory_size * type_size);
-        //        std::cout << "    memory: " << memory << std::endl;
+        //  std::cout << "    memory: " << memory << std::endl;
        is.close();
-      } else {
-        //        std::cout << "  *not load "
-        //                  << " var : " << var.name() << std::endl;
-      }
-    }
-  }
+    };
+
+    template <typename Dtype, Precision P>
+    const framework::Program<Dtype, P>
+    Loader<Dtype, P>::Load(const std::string &dirname) {
+        std::string model_filename = dirname + "/__model__";
+        std::string program_desc_str;
+        ReadBinaryFile(model_filename, &program_desc_str);
+        framework::proto::ProgramDesc program_desc_proto;
+        program_desc_proto.ParseFromString(program_desc_str);
+
+        std::shared_ptr<framework::ProgramDesc> originProgramDesc =
+            std::make_shared<framework::ProgramDesc>(program_desc_proto);
+
+        framework::Program<Dtype, P> program;
+        program.originProgram = originProgramDesc;
+
+        std::shared_ptr<framework::Scope> scope =
+            std::make_shared<framework::Scope>();
+        program.scope = scope;
+
+        auto block = originProgramDesc->Block(0);
+
+        for (auto block : originProgramDesc->Blocks()) {
+            //    std::cout << "for block" << std::endl;
+            for (int i = 0; i < block->Vars().size(); ++i) {
+                std::shared_ptr<framework::VarDesc> var_desc = block->Vars()[i];
+                auto var = scope->Var(var_desc->Name());
+                if (var_desc->GetType() ==
+                    framework::proto::VarType::LOD_TENSOR) {
+                    if (var_desc->Persistable() &&
+                        var_desc->GetType() !=
+                            framework::proto::VarType::FEED_MINIBATCH &&
+                        var_desc->GetType() !=
+                            framework::proto::VarType::FETCH_LIST) {
+                        framework::LoDTensor *tensor =
+                            var->GetMutable<framework::LoDTensor>();
+                        // to load
+                        LoadVar(tensor, dirname + "/" + var_desc->Name());
+                    }
+                } else {
+                    //        std::cout << "非 lod" << std::endl;
+                }
+            }
+        }
+
+#ifdef PADDLE_MOBILE_DEBUG
+        for (int i = 0; i < program_desc_proto.blocks().size(); ++i) {
+            framework::proto::BlockDesc block = program_desc_proto.blocks()[i];
+            //    std::cout << "block: " << block.idx() << std::endl;
+            for (int j = 0; j < block.ops().size(); ++j) {
+                framework::proto::OpDesc op = block.ops()[j];
+
+                //      std::cout << " op: " << op.type() << std::endl;
+                for (int m = 0; m < op.inputs_size(); ++m) {
+                    const framework::proto::OpDesc::Var &var = op.inputs(m);
+                    //        std::cout << "  input parameter: " <<
+                    //        var.parameter() <<
+                    //        std::endl;
+                    for (int n = 0; n < var.arguments().size(); ++n) {
+                        //          std::cout << "   argument - " <<
+                        //          var.arguments()[n] <<
+                        //          std::endl;
+                    }
+                }
+
+                for (int y = 0; y < op.outputs_size(); ++y) {
+                    const framework::proto::OpDesc::Var &var = op.outputs(y);
+                    //        std::cout << "  output parameter: " <<
+                    //        var.parameter() <<
+                    //        std::endl;
+                    for (int z = 0; z < var.arguments().size(); ++z) {
+                        //          std::cout << "   argument - " <<
+                        //          var.arguments()[z] <<
+                        //          std::endl;
+                    }
+                }
+
+                for (int x = 0; x < op.attrs().size(); ++x) {
+                    const framework::proto::OpDesc_Attr attr = op.attrs()[x];
+                    //        std::cout << "  attr name: " << attr.name() <<
+                    //        std::endl;
+                    //        std::cout << "  attr type: " << attr.type() <<
+                    //        std::endl;
+
+                    switch (attr.type()) {
+                    case framework::proto::AttrType::BOOLEAN:
+                        //            std::cout << "   boolen: " << attr.b() <<
+                        //            std::endl;
+                        break;
+                    case framework::proto::AttrType::INT:
+                        //            std::cout << "   int: " << attr.i() <<
+                        //            std::endl;
+                        break;
+                    case framework::proto::AttrType::FLOAT:
+                    //            std::cout << "   float: " << attr.f() <<
+                    //            std::endl;
+                    case framework::proto::AttrType::STRING:
+                    //            std::cout << "   string: " << attr.s() <<
+                    //            std::endl;
+                    case framework::proto::AttrType::BOOLEANS:
+                        //                            std::vector<bool>
+                        //                            bools(attr.bools_size());
+                        for (int y = 0; y < attr.bools_size(); ++y) {
+                            //              std::cout << "   bool - " <<
+                            //              attr.bools(y) <<
+                            //              std::endl;
+                        }
+                    case framework::proto::AttrType::LONG:
+                    //            std::cout << "   long: " << attr.l() <<
+                    //            std::endl;
+                    case framework::proto::AttrType::FLOATS:
+                        for (int y = 0; y < attr.floats_size(); ++y) {
+                            //              std::cout << "   float - " << y <<
+                            //              ": " <<
+                            //              attr.floats(y)
+                            //                        << std::endl;
+                        }
+                    case framework::proto::AttrType::INTS:
+                        for (int y = 0; y < attr.ints_size(); ++y) {
+                            //              std::cout << "   int - " << y << ":
+                            //              " <<
+                            //              attr.ints(y)
+                            //                        << std::endl;
+                        }
+                    case framework::proto::AttrType::STRINGS:
+                        for (int y = 0; y < attr.strings_size(); ++y) {
+                            //              std::cout << "   string - " << y <<
+                            //              ": " <<
+                            //              attr.strings(y)
+                            //                        << std::endl;
+                        }
+                    }
+                }
+            }
+
+            for (int k = 0; k < block.vars().size(); ++k) {
+                framework::proto::VarDesc var = block.vars()[k];
+                if (var.type().type() ==
+                    framework::proto::VarType::LOD_TENSOR) {
+                    //        std::cout << " var name: " << var.name() <<
+                    //        std::endl;
+                    const framework::proto::VarType::TensorDesc &tensor_desc =
+                        var.type().lod_tensor().tensor();
+                    //        std::cout << "  in var tensor desc dims size "
+                    //                  << tensor_desc.dims().size() <<
+                    //                  std::endl;
+                    int memory_size = 1;
+                    for (int l = 0; l < tensor_desc.dims().size(); ++l) {
+                        //          std::cout << " var tensor desc dim " << l
+                        //                    << " value: " <<
+                        //                    tensor_desc.dims()[l] <<
+                        //                    std::endl;
+                    }
+                }
+
+                if (var.persistable() &&
+                    var.type().type() !=
+                        framework::proto::VarType::FEED_MINIBATCH &&
+                    var.type().type() !=
+                        framework::proto::VarType::FETCH_LIST) {
+                    //        std::cout << "  to load " << var.name() <<
+                    //        std::endl;
+                    std::string file_path = dirname + "/" + var.name();
+                    std::ifstream is(file_path);
+                    std::streampos pos =
+                        is.tellg(); //   save   current   position
+                    is.seekg(0, std::ios::end);
+                    //        std::cout << "  file length = " << is.tellg() <<
+                    //        std::endl;
+                    is.seekg(pos); //   restore   saved   position
+
+                    // 1. version
+                    uint32_t version;
+                    is.read(reinterpret_cast<char *>(&version),
+                            sizeof(version));
+                    //        std::cout << "   version: " << version <<
+                    //        std::endl;
+
+                    // 2 Lod information
+                    uint64_t lod_level;
+                    is.read(reinterpret_cast<char *>(&lod_level),
+                            sizeof(lod_level));
+                    //        std::cout << "   load level: " << lod_level <<
+                    //        std::endl;
+                    //        std::cout << "   lod info: " << std::endl;
+                    for (uint64_t i = 0; i < lod_level; ++i) {
+                        uint64_t size;
+                        is.read(reinterpret_cast<char *>(&size), sizeof(size));
+                        std::vector<size_t> tmp(size / sizeof(size_t));
+                        is.read(reinterpret_cast<char *>(tmp.data()),
+                                static_cast<std::streamsize>(size));
+                        for (int j = 0; j < tmp.size(); ++j) {
+                            //            std::cout << "    lod - " << tmp[j] <<
+                            //            std::endl;
+                        }
+                    }
+
+                    uint32_t tensor_version;
+                    is.read(reinterpret_cast<char *>(&version),
+                            sizeof(version));
+                    //        std::cout << "   tensor_version: " <<
+                    //        tensor_version <<
+                    //        std::endl;
+
+                    int32_t size;
+                    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+                    //        std::cout << "   tensor desc size: " << size <<
+                    //        std::endl;
+                    std::unique_ptr<char[]> buf(new char[size]);
+                    is.read(reinterpret_cast<char *>(buf.get()), size);
+
+                    framework::proto::VarType::TensorDesc desc;
+                    desc.ParseFromArray(buf.get(), size);
+
+                    //        std::cout << "   desc dims size " <<
+                    //        desc.dims().size() <<
+                    //        std::endl;
+                    int memory_size = 1;
+                    for (int l = 0; l < desc.dims().size(); ++l) {
+                        //          std::cout << "    dim " << l << " value: "
+                        //          <<
+                        //          desc.dims()[l]
+                        //                    << std::endl;
+                        memory_size *= desc.dims()[l];
+                    }
+
+                    int type_size = 0;
+                    //        std::cout << "    desc pre type: ";
+                    switch (desc.data_type()) {
+                    case framework::proto::VarType::FP16:
+                        //            std::cout << "FP16" << std::endl;
+                        type_size = 2;
+                        break;
+                    case framework::proto::VarType::FP32:
+                        type_size = 4;
+                        //            std::cout << "FP32" << std::endl;
+                        break;
+                    case framework::proto::VarType::FP64:
+                        type_size = 8;
+                        //            std::cout << "FP64" << std::endl;
+                        break;
+                    case framework::proto::VarType::INT32:
+                        type_size = 4;
+                        //            std::cout << "INT32" << std::endl;
+                        break;
+                    case framework::proto::VarType::INT64:
+                        type_size = 8;
+                        //            std::cout << "INT64" << std::endl;
+                        break;
+                    case framework::proto::VarType::BOOL:
+                        type_size = 1;
+                        //            std::cout << "BOOL" << std::endl;
+                        break;
+                    default:
+                        break;
+                        //            std::cout << "    not support" <<
+                        //            std::endl;
+                    }
+
+                    //        std::cout << "    malloc size: " << memory_size *
+                    //        type_size
+                    //                  << std::endl;
+                    void *memory = malloc(memory_size * type_size);
+                    is.read(static_cast<char *>(memory),
+                            memory_size * type_size);
+                    //        std::cout << "    memory: " << memory <<
+                    //        std::endl;
+                    is.close();
+                } else {
+                    //        std::cout << "  *not load "
+                    //                  << " var : " << var.name() << std::endl;
+                }
+            }
+        }

 #endif
-  return program;
-}
+        return program;
+    }

-template class Loader<CPU, Precision::FP32>;
+    template class Loader<CPU, Precision::FP32>;

 } // namespace paddle_mobile
--- a/src/io.h
+++ b/src/io.h
@@ -27,13 +27,14 @@ SOFTWARE.

 namespace paddle_mobile {

-template <typename Dtype, Precision P = Precision::FP32>
-class Loader : PaddleMobileObject {
-public:
-  const framework::Program<Dtype, P> Load(const std::string &dirname);
+    template <typename Dtype, Precision P = Precision::FP32>
+    class Loader : PaddleMobileObject {
+      public:
+        const framework::Program<Dtype, P> Load(const std::string &dirname);

-private:
-  void LoadVar(framework::LoDTensor *tensor, const std::string &file_path);
-};
+      private:
+        void LoadVar(framework::LoDTensor *tensor,
+                     const std::string &file_path);
+    };

 } // namespace paddle_mobile
--- a/src/memory/t_malloc.cc
+++ b/src/memory/t_malloc.cc
@@ -22,30 +22,30 @@ SOFTWARE.
 #include <cstring>

 namespace paddle_mobile {
-namespace memory {
-const int MALLOC_ALIGN = 16;
+    namespace memory {
+        const int MALLOC_ALIGN = 16;

-void Copy(void *dst, const void *src, size_t num) {
-  std::memcpy(dst, src, num);
-};
+        // Copies `num` bytes from `src` to `dst` with std::memcpy, so the
+        // two ranges must not overlap.
+        // NOTE(review): the `;` after the closing brace is an empty
+        // declaration and can be dropped.
+        void Copy(void *dst, const void *src, size_t num) {
+            std::memcpy(dst, src, num);
+        };

-void *Alloc(size_t size) {
-  size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
-  char *p = static_cast<char *>(malloc(offset + size));
-  if (!p) {
-    return nullptr;
-  }
-  void *r = reinterpret_cast<void *>(reinterpret_cast<size_t>(p + offset) &
-                                     (~(MALLOC_ALIGN - 1)));
-  static_cast<void **>(r)[-1] = p;
-  return r;
-}
+        /**
+         * Allocates `size` bytes aligned to MALLOC_ALIGN.
+         *
+         * The block is over-allocated with malloc() so that the returned
+         * address can be rounded down to the alignment boundary, and the
+         * pointer originally returned by malloc() is stored in the slot
+         * directly in front of the returned address. Free() reads that slot
+         * back, so memory obtained here must be released with Free().
+         *
+         * \return  the aligned pointer, or nullptr when the request cannot
+         *          be satisfied (malloc failure or size overflow).
+         */
+        void *Alloc(size_t size) {
+            // Room for the saved malloc() pointer plus worst-case padding.
+            const size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
+            // A huge request would make `offset + size` wrap around and
+            // produce a buffer that is too small; treat it as a failure.
+            if (size > static_cast<size_t>(-1) - offset) {
+                return nullptr;
+            }
+            char *p = static_cast<char *>(malloc(offset + size));
+            if (!p) {
+                return nullptr;
+            }
+            // Round (p + offset) down to a multiple of MALLOC_ALIGN.
+            void *r = reinterpret_cast<void *>(
+                reinterpret_cast<size_t>(p + offset) &
+                ~static_cast<size_t>(MALLOC_ALIGN - 1));
+            // Remember the real allocation so Free() can release it.
+            static_cast<void **>(r)[-1] = p;
+            return r;
+        }

-void Free(void *ptr) {
-  if (ptr) {
-    free(static_cast<void **>(ptr)[-1]);
-  }
-}
+        // Releases a block returned by Alloc(). The pointer that malloc()
+        // originally produced is read from the slot just in front of `ptr`
+        // and handed to free(). A null `ptr` is ignored.
+        void Free(void *ptr) {
+            if (ptr) {
+                free(static_cast<void **>(ptr)[-1]);
+            }
+        }

-} // namespace memory
+    } // namespace memory
 } // namespace paddle_mobile
--- a/src/memory/t_malloc.h
+++ b/src/memory/t_malloc.h
@@ -21,44 +21,44 @@ SOFTWARE.
 #include <type_traits>

 namespace paddle_mobile {
-namespace memory {
+    namespace memory {

-void Copy(void *dst, const void *src, size_t num);
+        void Copy(void *dst, const void *src, size_t num);

-void *Alloc(size_t size);
+        void *Alloc(size_t size);

-void Free(void *ptr);
+        void Free(void *ptr);

-/**
- * \brief   Free memory block in one place.
- *
- * \note    In some cases, custom deleter is used to
- *          deallocate the memory automatically for
- *          std::unique_ptr<T> in tensor.h.
- *          static_cast
- */
-template <typename T> class PODDeleter {
-  static_assert(std::is_pod<T>::value, "T must be POD");
+        /**
+         * \brief   Deleter that releases a block of POD elements via Free().
+         *
+         * \note    Intended as the custom deleter of a std::unique_ptr<T>
+         *          (used in tensor.h according to the original note) so the
+         *          memory is deallocated automatically.
+         *          The pointer is converted with static_cast before it is
+         *          passed to Free(); T is checked to be POD at compile time.
+         */
+        template <typename T> class PODDeleter {
+            static_assert(std::is_pod<T>::value, "T must be POD");

-public:
-  explicit PODDeleter(){};
+          public:
+            explicit PODDeleter(){};

-  void operator()(T *ptr) { Free(static_cast<void *>(ptr)); }
-};
+            // Hands `ptr` to Free(); Free() itself ignores a null pointer.
+            void operator()(T *ptr) { Free(static_cast<void *>(ptr)); }
+        };

-/**
- * \brief   Free memory block in one place does not meet POD
- *
- * \note    In some cases, custom deleter is used to
- *          deallocate the memory automatically for
- *          std::unique_ptr<T> in tensor.h.
- *          reinterpret_cast
- */
-template <typename T> class PlainDeleter {
-public:
-  explicit PlainDeleter(){};
+        /**
+         * \brief   Deleter that releases a block via Free() without
+         *          requiring T to be POD.
+         *
+         * \note    Intended as the custom deleter of a std::unique_ptr<T>
+         *          (used in tensor.h according to the original note) so the
+         *          memory is deallocated automatically.
+         *          The pointer is converted with reinterpret_cast before it
+         *          is passed to Free(); no destructor of T is run here.
+         */
+        template <typename T> class PlainDeleter {
+          public:
+            explicit PlainDeleter(){};

-  void operator()(T *ptr) { Free(reinterpret_cast<void *>(ptr)); }
-};
-} // namespace memory
+            // Hands `ptr` to Free(); Free() itself ignores a null pointer.
+            void operator()(T *ptr) { Free(reinterpret_cast<void *>(ptr)); }
+        };
+    } // namespace memory
 } // namespace paddle_mobile
--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -22,55 +22,55 @@ SOFTWARE.
 #include "framework/operator.h"

 namespace paddle_mobile {
-namespace operators {
+    namespace operators {

-int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
-                   int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  return output_size;
-}
+        // Output extent of a convolution along one dimension:
+        //   dkernel = dilation * (filter_size - 1) + 1
+        //   output  = (input_size + 2 * padding - dkernel) / stride + 1
+        // The division is integer division. No argument is validated here,
+        // so `stride` must be non-zero.
+        int ConvOutputSize(int input_size, int filter_size, int dilation,
+                           int padding, int stride) {
+            const int dkernel = dilation * (filter_size - 1) + 1;
+            int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+            return output_size;
+        }

-template <typename Dtype, typename T>
-void ConvOp<Dtype, T>::InferShape() const {
-  //  std::cout << " begin get dims: " << std::endl;
+        /**
+         * Computes the output shape of the convolution and resizes the
+         * output tensor of param_ accordingly.
+         *
+         * The shape is {in_dims[0], filter_dims[0]} followed by one
+         * ConvOutputSize() result per entry of the strides attribute.
+         */
+        template <typename Dtype, typename T>
+        void ConvOp<Dtype, T>::InferShape() const {
+            //  std::cout << " begin get dims: " << std::endl;

-  auto in_dims = param_.Input()->dims();
+            auto in_dims = param_.Input()->dims();

-  //  std::cout << " end get in dims: " << std::endl;
+            //  std::cout << " end get in dims: " << std::endl;

-  //  std::cout << " in_dims: " << in_dims << std::endl;
+            //  std::cout << " in_dims: " << in_dims << std::endl;

-  //  std::cout << " begin get Filter " << std::endl;
+            //  std::cout << " begin get Filter " << std::endl;

-  auto filter_dims = param_.Filter()->dims();
+            auto filter_dims = param_.Filter()->dims();

-  //  std::cout << " end get Filter " << std::endl;
+            //  std::cout << " end get Filter " << std::endl;

-  //  std::cout << " begin get Attrs " << std::endl;
+            //  std::cout << " begin get Attrs " << std::endl;

-  const std::vector<int> &strides = param_.Strides();
+            const std::vector<int> &strides = param_.Strides();

-  //  std::cout << " end get Attrs " << strides[0] << std::endl;
+            //  std::cout << " end get Attrs " << strides[0] << std::endl;

-  std::vector<int> paddings = param_.Paddings();
+            std::vector<int> paddings = param_.Paddings();

-  int groups = param_.Groups();
+            // NOTE(review): `groups` is read but not used in this function.
+            int groups = param_.Groups();

-  std::vector<int> dilations = param_.Dilations();
+            std::vector<int> dilations = param_.Dilations();

-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                          dilations[i], paddings[i],
-                                          strides[i]));
-  }
+            // The first two output dims are taken from the input and the
+            // filter; the remaining ones are computed per strides entry.
+            // paddings and dilations are indexed with the same i, so they
+            // are assumed to be at least as long as strides.
+            std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+            for (size_t i = 0; i < strides.size(); ++i) {
+                output_shape.push_back(
+                    ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                   dilations[i], paddings[i], strides[i]));
+            }

-  framework::DDim ddim = framework::make_ddim(output_shape);
-  param_.Output()->Resize(ddim);
-}
+            framework::DDim ddim = framework::make_ddim(output_shape);
+            param_.Output()->Resize(ddim);
+        }

-template class ConvOp<CPU, float>;
+        template class ConvOp<CPU, float>;

-} // namespace operators
+    } // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/conv_op.h
+++ b/src/operators/conv_op.h
@@ -22,32 +22,33 @@ SOFTWARE.
 #include "operators/kernel/conv_kernel.h"

 namespace paddle_mobile {
-namespace operators {
-
-using namespace framework;
-
-template <typename DeviceType, typename T>
-class ConvOp : public framework::OperatorWithKernel<DeviceType> {
-public:
-  ConvOp(const std::string &type, const VariableNameMap &inputs,
-         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
-         std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs,
-                                                  scope),
-        param_(inputs, outputs, attrs, *scope) {}
-
-  using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
-  void InferShape() const override;
-
-protected:
-  void RunImpl() const {
-    operators::ConvKernel<DeviceType, T, ConvParam> kernel;
-    kernel.Compute(param_);
-    this->ClearVariables();
-  }
-
-  ConvParam param_;
-};
-
-} // operators
+    namespace operators {
+
+        using namespace framework;
+
+        /**
+         * Convolution operator.
+         *
+         * Holds a ConvParam built from the operator's inputs, outputs,
+         * attributes and scope, and runs a ConvKernel on it.
+         */
+        template <typename DeviceType, typename T>
+        class ConvOp : public framework::OperatorWithKernel<DeviceType> {
+          public:
+            // Forwards all arguments to the base operator and builds
+            // param_ from the same inputs/outputs/attrs and `*scope`.
+            ConvOp(const std::string &type, const VariableNameMap &inputs,
+                   const VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs,
+                   std::shared_ptr<framework::Scope> scope)
+                : framework::OperatorWithKernel<DeviceType>(
+                      type, inputs, outputs, attrs, scope),
+                  param_(inputs, outputs, attrs, *scope) {}
+
+            using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
+            // Resizes the output tensor; defined in conv_op.cpp.
+            void InferShape() const override;
+
+            // Creates a kernel for this device/type, runs it on param_ and
+            // then calls ClearVariables() on the operator.
+            void Run() const {
+                operators::ConvKernel<DeviceType, T, ConvParam> kernel;
+                kernel.Compute(param_);
+                this->ClearVariables();
+            }
+
+          private:
+            ConvParam param_;
+        };
+
+    } // operators
 } // paddle_mobile
--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+==============================================================================*/
+
+#include "elementwise_add_op.h"
+
+namespace paddle_mobile {
+    namespace operators {
+
+        // The output tensor takes exactly the dims of input X; input Y is
+        // not consulted when inferring the shape.
+        template <typename Dtype, typename T>
+        void ElementwiseAddOp<Dtype, T>::InferShape() const {
+            auto x_dim = param_.InputX()->dims();
+            param_.Out()->Resize(x_dim);
+        }
+        // Explicit instantiation for the CPU / float combination.
+        template class ElementwiseAddOp<CPU, float>;
+    } // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/elementwise_add_op.h
+++ b/src/operators/elementwise_add_op.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+==============================================================================*/
+
+#pragma once
+
+#include "framework/operator.h"
+#include "kernel/elementwise_add_kernel.h"
+#include "op_param.h"
+
+namespace paddle_mobile {
+    namespace operators {
+
+        using namespace framework;
+
+        /**
+         * Element-wise addition operator.
+         *
+         * Holds an ElementwiseAddParam built from the operator's inputs,
+         * outputs, attributes and scope, and runs an ElementwiseAddKernel
+         * on it.
+         */
+        template <typename DeviceType, typename T>
+        class ElementwiseAddOp
+            : public framework::OperatorWithKernel<DeviceType> {
+          public:
+            // Forwards all arguments to the base operator and builds
+            // param_ from the same inputs/outputs/attrs and `*scope`.
+            // `attrs` is taken by const reference (as ConvOp does) so the
+            // attribute map is not copied on every construction.
+            ElementwiseAddOp(const std::string &type,
+                             const VariableNameMap &inputs,
+                             const VariableNameMap &outputs,
+                             const framework::AttributeMap &attrs,
+                             std::shared_ptr<framework::Scope> scope)
+                : framework::OperatorWithKernel<DeviceType>(
+                      type, inputs, outputs, attrs, scope),
+                  param_(inputs, outputs, attrs, *scope) {}
+
+            // Creates a kernel for this device/type and runs it on param_.
+            // NOTE(review): ConvOp::Run() additionally calls
+            // ClearVariables(); confirm whether that is needed here too.
+            void Run() const {
+                operators::ElementwiseAddKernel<DeviceType, T,
+                                                ElementwiseAddParam>
+                    kernel;
+                kernel.Compute(param_);
+            }
+
+            using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
+            // Resizes the output tensor; defined in elementwise_add_op.cpp.
+            void InferShape() const override;
+
+          protected:
+            ElementwiseAddParam param_;
+        };
+    } // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -19,135 +19,146 @@ SOFTWARE.
 #include "operators/kernel/conv_kernel.h"

 namespace paddle_mobile {
-namespace operators {
-
-bool IsExpand(const std::vector<int64_t> &filter_dim,
-              const std::vector<int> &strides, const std::vector<int> &paddings,
-              const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
-
-template <>
-void ConvKernel<CPU, float, ConvParam>::Compute(const ConvParam &param) const {
-  const Tensor *input = param.Input();
-
-  LOG(kLOG_DEBUG) << param;
-
-  // The filter will be reshaped in the calculations,
-  // so here use an assignment operation,
-  // that avoids modifying the variable in the Scope.
-  Tensor filter = *param.Filter();
-
-  Tensor *output = param.Output();
-  //            output->mutable_data<T>(context.GetPlace());
-
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  DLOG << " compute end get Attrs " << strides[0];
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-
-  // use col_shape in the im2col calculation
-  // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
-  // o_h, o_w}
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  // use col_matrix_shape in the gemm calculation
-  // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
-  // o_h * o_w)
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  // col_matrix shares the same piece of data with col,
-  // but will be reshaped into a two-dimensional matrix shape
-  // to call the matrix multiplication interface.
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-
-  DLOG << " input dim " << input->dims();
-
-  DLOG << " output dim " << output->dims();
-
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  //            auto& dev_ctx = context.template
-  //            device_context<DeviceContext>();
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false, float(1.0),
-                          &out_slice, float(0.0));
-    }
-  }
-}
-
-template class ConvKernel<CPU, float, ConvParam>;
-
-} // namespace operators
+    namespace operators {
+
+        // Returns false only when, for every index j of `strides`, the
+        // filter extent filter_dim[j + 2] is 1, the stride is 1, the padding
+        // is 0 and the dilation is 1; returns true otherwise. Compute()
+        // uses the result to decide whether a separate column buffer has to
+        // be filled by im2col/vol2col.
+        // The loop is driven by strides.size(); filter_dim, paddings and
+        // dilations are assumed to be long enough for the same indices.
+        bool IsExpand(const std::vector<int64_t> &filter_dim,
+                      const std::vector<int> &strides,
+                      const std::vector<int> &paddings,
+                      const std::vector<int> &dilations) {
+            bool filter_1 = true, strides_1 = true, padding_0 = true,
+                 dilation_1 = true;
+            for (size_t j = 0; j < strides.size(); ++j) {
+                filter_1 =
+                    filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+                strides_1 = strides_1 && (strides[j] == 1);
+                padding_0 = padding_0 && (paddings[j] == 0);
+                dilation_1 = dilation_1 && (dilations[j] == 1);
+            }
+            return !(filter_1 && strides_1 && padding_0 && dilation_1);
+        }
+
+        /**
+         * CPU / float convolution: im2col (or vol2col) followed by a matrix
+         * multiplication, executed once per batch item and per group.
+         *
+         * Input, filter, output and the groups/strides/paddings/dilations
+         * attributes are all read from `param`. The result is written
+         * through the tensor returned by param.Output().
+         */
+        template <>
+        void ConvKernel<CPU, float, ConvParam>::Compute(
+            const ConvParam &param) const {
+            LOG(kLOG_DEBUG) << param;
+
+            const Tensor *input = param.Input();
+
+            // The filter will be reshaped in the calculations,
+            // so here use an assignment operation,
+            // that avoids modifying the variable in the Scope.
+            Tensor filter = *param.Filter();
+
+            Tensor *output = param.Output();
+            //            output->mutable_data<T>(context.GetPlace());
+
+            int groups = param.Groups();
+            std::vector<int> strides = param.Strides();
+            std::vector<int> paddings = param.Paddings();
+            std::vector<int> dilations = param.Dilations();
+
+            DLOG << " compute end get Attrs " << strides[0];
+
+            const int batch_size = static_cast<int>(input->dims()[0]);
+
+            // filter_shape_vec: {k_o, k_i, k_h, k_w} or
+            // {k_o, k_i, k_d, k_h, k_w}
+            std::vector<int64_t> filter_shape_vec(
+                framework::vectorize(filter.dims()));
+            // output_shape_vec: {o_n, o_c, o_h, o_w} or
+            // {o_n, o_c, o_d, o_h, o_w}
+            std::vector<int64_t> output_shape_vec(
+                framework::vectorize(output->dims()));
+
+            // use col_shape in the im2col calculation
+            // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or
+            // {i_c/g, k_d, k_h, k_w, o_d, o_h, o_w}
+            size_t data_dim = filter_shape_vec.size() - 2;
+            std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+            col_shape_vec[0] = input->dims()[1] / groups;
+            for (size_t j = 0; j < data_dim; ++j) {
+                col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+                col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+            }
+            framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+            // use col_matrix_shape in the gemm calculation
+            // size: (i_c/g * k_h * k_w, o_h * o_w) or
+            // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
+            framework::DDim col_matrix_shape =
+                framework::flatten_to_2d(col_shape, data_dim + 1);
+
+            bool is_expand =
+                IsExpand(filter_shape_vec, strides, paddings, dilations);
+            Tensor col;
+            // col_matrix shares the same piece of data with col,
+            // but will be reshaped into a two-dimensional matrix shape
+            // to call the matrix multiplication interface.
+            Tensor col_matrix;
+            if (is_expand) {
+                // The column buffer is only allocated when im2col/vol2col
+                // actually has to fill it.
+                col.mutable_data<float>(col_shape);
+                col_matrix.ShareDataWith(col);
+                col_matrix.Resize(col_matrix_shape);
+            }
+
+            // Shape of a single batch item: the input dims without dim 0.
+            framework::DDim input_shape = framework::slice_ddim(
+                input->dims(), 1, static_cast<int>(input->dims().size()));
+
+            // View the local filter copy as a (k_o, rest) matrix.
+            framework::DDim filter_matrix_shape = {
+                filter.dims()[0], filter.numel() / filter.dims()[0]};
+            filter.Resize(filter_matrix_shape);
+
+            // Per batch item the output is viewed as (o_c, rest).
+            framework::DDim output_matrix_shape = {
+                output->dims()[1],
+                output->numel() / (output->dims()[0] * output->dims()[1])};
+
+            // convolution operator: im2col(or vol2col) + gemm
+            int in_step = static_cast<int>(input->dims()[1]) / groups;
+            int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+            math::Vol2ColFunctor<CPU, float> vol2col;
+            math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+
+            //            auto& dev_ctx = context.template
+            //            device_context<DeviceContext>();
+            for (int i = 0; i < batch_size; i++) {
+                Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+                Tensor out_batch =
+                    output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+                for (int g = 0; g < groups; g++) {
+                    Tensor in_slice =
+                        in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+                    // NOTE(review): when is_expand is true and data_dim is
+                    // neither 2 nor 3, no branch below fills `col` before
+                    // the gemm -- confirm such inputs cannot occur.
+                    if (!is_expand) {
+                        // No expansion needed: the input slice itself is
+                        // used as the column matrix.
+                        col.ShareDataWith(in_slice);
+                        col_matrix.ShareDataWith(col);
+                        col_matrix.Resize(col_matrix_shape);
+                    } else if (data_dim == 2U) {
+                        // im2col; the two paddings are passed twice, as
+                        // {p0, p1, p0, p1}.
+                        im2col(in_slice, dilations, strides,
+                               std::vector<int>{paddings[0], paddings[1],
+                                                paddings[0], paddings[1]},
+                               &col);
+                    } else if (data_dim == 3U) {
+                        // vol2col
+                        vol2col(in_slice, dilations, strides, paddings, &col);
+                    }
+
+                    // gemm: out_slice = 1.0 * filter_slice * col_matrix
+                    //                   + 0.0 * out_slice
+                    Tensor out_slice =
+                        out_batch.Slice(g * out_step, (g + 1) * out_step);
+                    Tensor filter_slice =
+                        filter.Slice(g * out_step, (g + 1) * out_step);
+                    math::matmul<float>(filter_slice, false, col_matrix, false,
+                                        float(1.0), &out_slice, float(0.0));
+                }
+            }
+        }
+
+        template class ConvKernel<CPU, float, ConvParam>;
+
+    } // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/kernel/arm/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/kernel/elementwise_add_kernel.h"
+
+namespace paddle_mobile {
+    namespace operators {
+
+        // Binary functor returning the sum of its two arguments.
+        template <typename T> struct AddFunctor {
+            inline T operator()(T a, T b) const { return a + b; }
+        };
+
+        /**
+         * CPU / float element-wise addition.
+         *
+         * Reads X, Y and the axis attribute from `param`, requests float
+         * storage for the output tensor and delegates the computation to
+         * ElementwiseComputeEx with AddFunctor<float>.
+         */
+        template <>
+        void ElementwiseAddKernel<CPU, float, ElementwiseAddParam>::Compute(
+            const ElementwiseAddParam &param) const {
+            const Tensor *input_x = param.InputX();
+            const Tensor *input_y = param.InputY();
+            Tensor *Out = param.Out();
+            // Make sure the output has float storage before it is written.
+            Out->mutable_data<float>();
+            const int axis = param.Axis();
+            ElementwiseComputeEx<AddFunctor<float>, float>(
+                input_x, input_y, axis, AddFunctor<float>(), Out);
+        }
+
+        template class ElementwiseAddKernel<CPU, float, ElementwiseAddParam>;
+
+    } // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/kernel/conv_kernel.h
+++ b/src/operators/kernel/conv_kernel.h
@@ -25,14 +25,15 @@ SOFTWARE.
 #pragma once;

 namespace paddle_mobile {
-namespace operators {
+    namespace operators {

-using namespace framework;
+        using namespace framework;

-template <typename DeviceType, typename T, typename P>
-class ConvKernel : public framework::OpKernelBase<DeviceType, ConvParam> {
-public:
-  void Compute(const ConvParam &param) const;
-};
-}
+        // Convolution kernel interface; Compute() is specialised per device
+        // and element type in the kernel source files (e.g. the CPU / float
+        // version under kernel/arm).
+        // NOTE(review): template parameter P is not used in this
+        // declaration -- the base class and Compute() name ConvParam
+        // directly.
+        template <typename DeviceType, typename T, typename P>
+        class ConvKernel
+            : public framework::OpKernelBase<DeviceType, ConvParam> {
+          public:
+            void Compute(const ConvParam &param) const;
+        };
+    }
 }
--- a/src/operators/kernel/elementwise_add_kernel.h
+++ b/src/operators/kernel/elementwise_add_kernel.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+==============================================================================*/
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/math/elementwise_op_function.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+    namespace operators {
+
+        using namespace framework;
+
+        // Element-wise addition kernel interface; Compute() is specialised
+        // per device and element type in the kernel source files (e.g. the
+        // CPU / float version under kernel/arm).
+        // NOTE(review): template parameter P is not used in this
+        // declaration -- the base class and Compute() name
+        // ElementwiseAddParam directly.
+        template <typename DeviceType, typename T, typename P>
+        class ElementwiseAddKernel
+            : public framework::OpKernelBase<DeviceType, ElementwiseAddParam> {
+          public:
+            void Compute(const ElementwiseAddParam &param) const;
+        };
+    } // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/kernel/fpga/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_kernel.cpp
@@ -19,11 +19,12 @@ SOFTWARE.
 #include "operators/kernel/conv_kernel.h"

 namespace paddle_mobile {
-namespace operators {
+    namespace operators {

-// template<>
-// void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const {}
-//
-// template class ConvKernel<FPGA, float>;
-}
+        // template<>
+        // void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const
+        // {}
+        //
+        // template class ConvKernel<FPGA, float>;
+    }
 }
--- a/src/operators/math/elementwise_op_function.h
+++ b/src/operators/math/elementwise_op_function.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "transform.h"
+
+// Branch hint: yields the truth value of `condition` while telling the
+// compiler (via the GCC/Clang builtin __builtin_expect) that it is expected
+// to be false.
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+
+namespace paddle_mobile {
+    namespace operators {
+
+        /*
+         * Out = X ⊙ Y
+         *
+         * Splits the shape of X into three consecutive groups relative to Y,
+         * so that Y can be broadcast against X:
+         *   pre  = product of x_dims[0 .. axis)
+         *   n    = product of all y_dims
+         *   post = product of x_dims[axis + rank(Y) .. rank(X))
+         *
+         * For example:
+         * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+         *    pre=2, n=3*4, post=5
+         *    x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5)
+         * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+         *    pre=2*3, n=4*5, post=1
+         *    x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1)
+         *
+         * The results are written through `pre`, `n` and `post`.
+         */
+        inline void get_mid_dims(const framework::DDim &x_dims,
+                                 const framework::DDim &y_dims, const int axis,
+                                 int *pre, int *n, int *post) {
+            *pre = 1;
+            *n = 1;
+            *post = 1;
+
+            // Dimensions of X that come before the part matched by Y.
+            int x_pos = 0;
+            while (x_pos < axis) {
+                (*pre) *= x_dims[x_pos];
+                ++x_pos;
+            }
+
+            // Dimensions shared by X and Y; each pair must be equal
+            // (broadcast dimension mismatch otherwise).
+            int y_pos = 0;
+            while (y_pos < y_dims.size()) {
+                assert(x_dims[y_pos + axis] == y_dims[y_pos]);
+                (*n) *= y_dims[y_pos];
+                ++y_pos;
+            }
+
+            // Dimensions of X that follow the part matched by Y.
+            x_pos = axis + y_dims.size();
+            while (x_pos < x_dims.size()) {
+                (*post) *= x_dims[x_pos];
+                ++x_pos;
+            }
+        }
+
+        /// Drops every trailing dimension of size 1 from `dims` in place,
+        /// e.g. (4, 20, 1, 1) -> (4, 20). `dims` is left untouched when its
+        /// last dimension is not 1.
+        inline void trim_trailing_singular_dims(framework::DDim *dims) {
+            // Number of leading dimensions that survive the trim.
+            auto kept = dims->size();
+            while (kept != 0 && (*dims)[kept - 1] == 1) {
+                --kept;
+            }
+
+            // Nothing to remove: keep the original DDim as is.
+            if (kept == dims->size()) {
+                return;
+            }
+
+            // Rebuild the DDim from the truncated list of extents.
+            auto shape = framework::vectorize(*dims);
+            shape.resize(kept);
+            *dims = framework::make_ddim(shape);
+        }
+
+        /**
+         * Read-only cyclic iterator over the `n` elements starting at `ptr`.
+         *
+         * Incrementing visits ptr[0], ptr[1], ..., ptr[n - 1] and then wraps
+         * back to ptr[0], so a row of length n can be replayed against a
+         * longer sequence.
+         *
+         * Two iterators compare equal when they currently refer to the same
+         * element address.
+         */
+        template <typename T> class RowwiseTransformIterator {
+          public:
+            RowwiseTransformIterator(const T *ptr, int n)
+                : ptr_(ptr), i_(0), n_(n) {}
+
+            /// Advances to the next element, wrapping to the first one after
+            /// the n-th.
+            RowwiseTransformIterator<T> &operator++() {
+                ++i_;
+                if (UNLIKELY(i_ == n_)) {
+                    i_ = 0;
+                }
+                return *this;
+            }
+
+            bool operator==(const RowwiseTransformIterator<T> &rhs) const {
+                return (ptr_ + i_) == &(*rhs);
+            }
+
+            bool operator!=(const RowwiseTransformIterator<T> &rhs) const {
+                return (ptr_ + i_) != &(*rhs);
+            }
+
+            /// Const-qualified so that it can be called on the const `rhs`
+            /// in operator== / operator!=; without the qualifier those two
+            /// operators fail to compile as soon as they are instantiated.
+            const T &operator*() const { return ptr_[i_]; }
+
+          private:
+            const T *ptr_; // first element of the row
+            int64_t i_;    // current position within the row, in [0, n_)
+            int64_t n_;    // row length
+        };
+
+        /**
+         * Read-only cyclic iterator that repeats each of the `n` elements
+         * starting at `ptr` `post` times in a row before moving on, and wraps
+         * back to the first element after the n-th.
+         *
+         * Example: for (4, 20, 2) + (20,), the (20,) operand is treated like
+         * (20, 1): every one of its 20 values is yielded twice (post = 2)
+         * so that it lines up with the two entries of the last dimension of
+         * the (4, 20, 2) operand.
+         *
+         * Two iterators compare equal when they currently refer to the same
+         * element address; the repeat counter is not part of the comparison.
+         */
+        template <typename T> class MidWiseTransformIterator {
+          public:
+            MidWiseTransformIterator(const T *ptr, int n, int post)
+                : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
+
+            /// Advances the repeat counter; once an element has been yielded
+            /// `post` times, moves to the next element (wrapping after n).
+            MidWiseTransformIterator<T> &operator++() {
+                ++j_;
+                if (UNLIKELY(j_ == post_)) {
+                    ++i_;
+                    j_ = 0;
+                    if (UNLIKELY(i_ == n_)) {
+                        i_ = 0;
+                    }
+                }
+                return *this;
+            }
+
+            bool operator==(const MidWiseTransformIterator<T> &rhs) const {
+                return (ptr_ + i_) == &(*rhs);
+            }
+
+            bool operator!=(const MidWiseTransformIterator<T> &rhs) const {
+                return (ptr_ + i_) != &(*rhs);
+            }
+
+            /// Const-qualified so that it can be called on the const `rhs`
+            /// in operator== / operator!=; without the qualifier those two
+            /// operators fail to compile as soon as they are instantiated.
+            const T &operator*() const { return ptr_[i_]; }
+
+          private:
+            const T *ptr_; // first element of the sequence
+            int64_t i_;    // index of the current element, in [0, n_)
+            int64_t j_;    // how many times it has been yielded, in [0, post_)
+            int64_t n_;    // number of distinct elements
+            int64_t post_; // repeat count per element
+        };
+
+        /**
+         * Binds two input tensors, an output tensor and a binary functor, and
+         * applies the functor element by element through math::Transform.
+         *
+         * The constructor caches the data pointers of x and y, the mutable
+         * data pointer of z and the element count of x. Every Run* method
+         * iterates over all elements of x and writes the results to z.
+         */
+        template <typename Functor, typename T, typename OutType = T>
+        class TransformFunctor {
+          public:
+            TransformFunctor(const framework::Tensor *x,
+                             const framework::Tensor *y, framework::Tensor *z,
+                             Functor func)
+                : x_(x->data<T>()), y_(y->data<T>()),
+                  z_(z->mutable_data<OutType>()), nx_(x->numel()), func_(func) {
+            }
+
+            /// Applies func(x[i], y[i]) for every i and stores the result in
+            /// z[i]; y is read linearly, alongside x.
+            inline void Run() const {
+                const T *const x_end = x_ + nx_;
+                math::Transform apply;
+                apply(x_, x_end, y_, z_, func_);
+            }
+
+            /// Same as Run(), but y is read through a
+            /// RowwiseTransformIterator of length `n`, i.e. the first n
+            /// values of y are replayed cyclically. `pre` is not used.
+            inline void RunRowWise(int n, int pre) const {
+                const T *const x_end = x_ + nx_;
+                math::Transform apply;
+                apply(x_, x_end, RowwiseTransformIterator<T>(y_, n), z_,
+                      func_);
+            }
+
+            /// Same as Run(), but y is read through a
+            /// MidWiseTransformIterator: each of the first n values of y is
+            /// repeated `post` times, cyclically. `pre` is not used.
+            inline void RunMidWise(int n, int pre, int post) const {
+                const T *const x_end = x_ + nx_;
+                math::Transform apply;
+                apply(x_, x_end, MidWiseTransformIterator<T>(y_, n, post),
+                      z_, func_);
+            }
+
+          private:
+            const T *x_;   // data of the first input
+            const T *y_;   // data of the second (possibly broadcast) input
+            OutType *z_;   // output buffer
+            int64_t nx_;   // number of elements in x
+            Functor func_; // binary operation applied to each pair
+        };
+
+        /**
+         * Computes z = func(x, y) element-wise, broadcasting y against x.
+         *
+         * - If x and y have identical dims, the functor is applied directly.
+         * - Otherwise `axis` is the index of the dimension of x at which y
+         *   starts; axis == -1 is replaced by rank(x) - rank(y), which
+         *   aligns y with the trailing dimensions of x.
+         * - Trailing dimensions of size 1 are removed from y's dims first.
+         *   If nothing is left, axis becomes rank(x), so that pre covers
+         *   all of x and n = post = 1.
+         *
+         * The upstream checks rank(x) >= rank(y) and 0 <= axis < rank(x)
+         * are not enforced here.
+         */
+        template <typename Functor, typename T, typename OutType = T>
+        void ElementwiseComputeEx(const framework::Tensor *x,
+                                  const framework::Tensor *y, int axis,
+                                  Functor func, framework::Tensor *z) {
+            TransformFunctor<Functor, T, OutType> functor(x, y, z, func);
+
+            auto x_dims = x->dims();
+            auto y_dims = y->dims();
+
+            // Same shape: plain element-wise application, no broadcasting.
+            if (x_dims == y_dims) {
+                functor.Run();
+                return;
+            }
+
+            // axis == -1: align y with the last dimensions of x.
+            if (axis == -1) {
+                axis = x_dims.size() - y_dims.size();
+            }
+
+            trim_trailing_singular_dims(&y_dims);
+            if (y_dims.size() == 0) {
+                axis = x_dims.size();
+            }
+
+            int pre = 0;
+            int n = 0;
+            int post = 0;
+            get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+
+            // post == 1 means y covers the trailing dimensions of x, so a
+            // simple cyclic replay of y is enough; otherwise each value of
+            // y has to be repeated `post` times.
+            if (post != 1) {
+                functor.RunMidWise(n, pre, post);
+                return;
+            }
+            functor.RunRowWise(n, pre);
+        }
+
+    } // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/math/im2col.cc
+++ b/src/operators/math/im2col.cc
@@ -16,275 +16,349 @@ limitations under the License. */
 #include "common/types.h"

 namespace paddle_mobile {
-namespace operators {
-namespace math {
+    namespace operators {
+        namespace math {

-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [input_channels, filter_height, filter_width, output_height, output_width]
- */
-template <class T> class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
-public:
-  void operator()(const framework::Tensor &im, const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding, framework::Tensor *col) {
-    //    PADDLE_ENFORCE(im.dims().size() == 3);
-    //    PADDLE_ENFORCE(col->dims().size() == 5);
+            /*
+             * im = [input_channels, input_height, input_width]
+             * col =
+             *   [input_channels, filter_height, filter_width, output_height,
+             * output_width]
+             */
+            template <class T> class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
+              public:
+                void operator()(const framework::Tensor &im,
+                                const std::vector<int> &dilation,
+                                const std::vector<int> &stride,
+                                const std::vector<int> &padding,
+                                framework::Tensor *col) {
+                    //    PADDLE_ENFORCE(im.dims().size() == 3);
+                    //    PADDLE_ENFORCE(col->dims().size() == 5);

-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int col_height = col->dims()[3];
-    int col_width = col->dims()[4];
+                    int im_channels = im.dims()[0];
+                    int im_height = im.dims()[1];
+                    int im_width = im.dims()[2];
+                    int filter_height = col->dims()[1];
+                    int filter_width = col->dims()[2];
+                    int col_height = col->dims()[3];
+                    int col_width = col->dims()[4];

-    //    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-    //                       ((dilation[0] * (filter_height - 1) + 1))) /
-    //                              stride[0] +
-    //                          1,
-    //                      col_height,
-    //                      "Output_height and padding(padding_up, padding_down)
-    //                      are " "inconsistent.");
-    //    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-    //                       ((dilation[1] * (filter_width - 1) + 1))) /
-    //                              stride[1] +
-    //                          1,
-    //                      col_width,
-    //                      "Output_height and padding(padding_up, padding_down)
-    //                      are " "inconsistent.");
+                    //    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
+                    //    -
+                    //                       ((dilation[0] * (filter_height - 1)
+                    //                       + 1))) /
+                    //                              stride[0] +
+                    //                          1,
+                    //                      col_height,
+                    //                      "Output_height and
+                    //                      padding(padding_up, padding_down)
+                    //                      are " "inconsistent.");
+                    //    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
+                    //    -
+                    //                       ((dilation[1] * (filter_width - 1)
+                    //                       + 1))) /
+                    //                              stride[1] +
+                    //                          1,
+                    //                      col_width,
+                    //                      "Output_height and
+                    //                      padding(padding_up, padding_down)
+                    //                      are " "inconsistent.");

-    int channels_col = im_channels * filter_height * filter_width;
+                    int channels_col =
+                        im_channels * filter_height * filter_width;

-    const T *im_data = im.data<T>();
-    T *col_data = col->data<T>();
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < col_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < col_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          int col_idx = (c * col_height + h) * col_width + w;
-          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
+                    const T *im_data = im.data<T>();
+                    T *col_data = col->data<T>();
+                    for (int c = 0; c < channels_col; ++c) {
+                        int w_offset = c % filter_width;
+                        int h_offset = (c / filter_width) % filter_height;
+                        int c_im = c / (filter_width * filter_height);
+                        for (int h = 0; h < col_height; ++h) {
+                            int im_row_idx = h * stride[0] - padding[0] +
+                                             h_offset * dilation[0];
+                            for (int w = 0; w < col_width; ++w) {
+                                int im_col_idx = w * stride[1] - padding[1] +
+                                                 w_offset * dilation[1];
+                                int col_idx =
+                                    (c * col_height + h) * col_width + w;
+                                int im_idx =
+                                    (im_row_idx + c_im * im_height) * im_width +
+                                    im_col_idx;

-          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                               im_col_idx < 0 || im_col_idx >= im_width)
-                                  ? static_cast<T>(0)
-                                  : im_data[im_idx];
-        }
-      }
-    }
-  }
-};
+                                col_data[col_idx] =
+                                    (im_row_idx < 0 ||
+                                     im_row_idx >= im_height ||
+                                     im_col_idx < 0 || im_col_idx >= im_width)
+                                        ? static_cast<T>(0)
+                                        : im_data[im_idx];
+                            }
+                        }
+                    }
+                }
+            };

-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [input_channels, filter_height, filter_width, output_height, output_width]
- */
-template <class T> class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
-public:
-  void operator()(const framework::Tensor &col,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding, framework::Tensor *im) {
-    //    PADDLE_ENFORCE(im->dims().size() == 3);
-    //    PADDLE_ENFORCE(col.dims().size() == 5);
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[1];
-    int filter_width = col.dims()[2];
-    int col_height = col.dims()[3];
-    int col_width = col.dims()[4];
+            /*
+             * im = [input_channels, input_height, input_width]
+             * col =
+             *   [input_channels, filter_height, filter_width, output_height,
+             * output_width]
+             */
+            template <class T> class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
+              public:
+                void operator()(const framework::Tensor &col,
+                                const std::vector<int> &dilation,
+                                const std::vector<int> &stride,
+                                const std::vector<int> &padding,
+                                framework::Tensor *im) {
+                    //    PADDLE_ENFORCE(im->dims().size() == 3);
+                    //    PADDLE_ENFORCE(col.dims().size() == 5);
+                    int im_channels = im->dims()[0];
+                    int im_height = im->dims()[1];
+                    int im_width = im->dims()[2];
+                    int filter_height = col.dims()[1];
+                    int filter_width = col.dims()[2];
+                    int col_height = col.dims()[3];
+                    int col_width = col.dims()[4];

-    //    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-    //                       ((dilation[0] * (filter_height - 1) + 1))) /
-    //                              stride[0] +
-    //                          1,
-    //                      col_height,
-    //                      "Output_height and padding(padding_up, padding_down)
-    //                      are " "inconsistent.");
-    //    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-    //                       ((dilation[1] * (filter_width - 1) + 1))) /
-    //                              stride[1] +
-    //                          1,
-    //                      col_width,
-    //                      "Output_height and padding(padding_up, padding_down)
-    //                      are " "inconsistent.");
+                    //    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
+                    //    -
+                    //                       ((dilation[0] * (filter_height - 1)
+                    //                       + 1))) /
+                    //                              stride[0] +
+                    //                          1,
+                    //                      col_height,
+                    //                      "Output_height and
+                    //                      padding(padding_up, padding_down)
+                    //                      are " "inconsistent.");
+                    //    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
+                    //    -
+                    //                       ((dilation[1] * (filter_width - 1)
+                    //                       + 1))) /
+                    //                              stride[1] +
+                    //                          1,
+                    //                      col_width,
+                    //                      "Output_height and
+                    //                      padding(padding_up, padding_down)
+                    //                      are " "inconsistent.");

-    int channels_col = im_channels * filter_height * filter_width;
+                    int channels_col =
+                        im_channels * filter_height * filter_width;

-    T *im_data = im->data<T>();
-    const T *col_data = col.data<T>();
+                    T *im_data = im->data<T>();
+                    const T *col_data = col.data<T>();

-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < col_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < col_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
-              (im_col_idx) >= 0 && (im_col_idx) < im_width) {
-            im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] +=
-                col_data[(c * col_height + h) * col_width + w];
-          }
-        }
-      }
-    }
-  }
-};
+                    for (int c = 0; c < channels_col; ++c) {
+                        int w_offset = c % filter_width;
+                        int h_offset = (c / filter_width) % filter_height;
+                        int c_im = c / (filter_width * filter_height);
+                        for (int h = 0; h < col_height; ++h) {
+                            int im_row_idx = h * stride[0] - padding[0] +
+                                             h_offset * dilation[0];
+                            for (int w = 0; w < col_width; ++w) {
+                                int im_col_idx = w * stride[1] - padding[1] +
+                                                 w_offset * dilation[1];
+                                if ((im_row_idx) >= 0 &&
+                                    (im_row_idx) < im_height &&
+                                    (im_col_idx) >= 0 &&
+                                    (im_col_idx) < im_width) {
+                                    im_data[(im_row_idx + c_im * im_height) *
+                                                im_width +
+                                            im_col_idx] +=
+                                        col_data[(c * col_height + h) *
+                                                     col_width +
+                                                 w];
+                                }
+                            }
+                        }
+                    }
+                }
+            };

-template class Im2ColFunctor<ColFormat::kCFO, CPU, float>;
-template class Im2ColFunctor<ColFormat::kCFO, CPU, double>;
-template class Col2ImFunctor<ColFormat::kCFO, CPU, float>;
-template class Col2ImFunctor<ColFormat::kCFO, CPU, double>;
+            template class Im2ColFunctor<ColFormat::kCFO, CPU, float>;
+            template class Im2ColFunctor<ColFormat::kCFO, CPU, double>;
+            template class Col2ImFunctor<ColFormat::kCFO, CPU, float>;
+            template class Col2ImFunctor<ColFormat::kCFO, CPU, double>;

-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [output_height, output_width, input_channels, filter_height, filter_width]
- */
-template <class T> class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
-public:
-  void operator()(const framework::Tensor &im, const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding, framework::Tensor *col) {
-    //    PADDLE_ENFORCE(im.dims().size() == 3);
-    //    PADDLE_ENFORCE(col->dims().size() == 5);
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[3];
-    int filter_width = col->dims()[4];
-    int col_height = col->dims()[0];
-    int col_width = col->dims()[1];
+            /*
+             * im = [input_channels, input_height, input_width]
+             * col =
+             *   [output_height, output_width, input_channels, filter_height,
+             * filter_width]
+             */
+            template <class T> class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
+              public:
+                void operator()(const framework::Tensor &im,
+                                const std::vector<int> &dilation,
+                                const std::vector<int> &stride,
+                                const std::vector<int> &padding,
+                                framework::Tensor *col) {
+                    //    PADDLE_ENFORCE(im.dims().size() == 3);
+                    //    PADDLE_ENFORCE(col->dims().size() == 5);
+                    int im_channels = im.dims()[0];
+                    int im_height = im.dims()[1];
+                    int im_width = im.dims()[2];
+                    int filter_height = col->dims()[3];
+                    int filter_width = col->dims()[4];
+                    int col_height = col->dims()[0];
+                    int col_width = col->dims()[1];

-    //    PADDLE_ENFORCE_EQ(
-    //        (im_height + padding[0] + padding[2] - filter_height) / stride[0]
-    //        + 1, col_height, "Output_height and padding(padding_up,
-    //        padding_down) are " "inconsistent.");
-    //    PADDLE_ENFORCE_EQ(
-    //        (im_width + padding[1] + padding[3] - filter_width) / stride[1] +
-    //        1, col_width, "col_width and padding(padding_left, padding_right)
-    //        are " "inconsistent.");
+                    //    PADDLE_ENFORCE_EQ(
+                    //        (im_height + padding[0] + padding[2] -
+                    //        filter_height) / stride[0]
+                    //        + 1, col_height, "Output_height and
+                    //        padding(padding_up,
+                    //        padding_down) are " "inconsistent.");
+                    //    PADDLE_ENFORCE_EQ(
+                    //        (im_width + padding[1] + padding[3] -
+                    //        filter_width) / stride[1] +
+                    //        1, col_width, "col_width and padding(padding_left,
+                    //        padding_right)
+                    //        are " "inconsistent.");

-    const T *im_data = im.data<T>();
-    T *col_data = col->data<T>();
+                    const T *im_data = im.data<T>();
+                    T *col_data = col->data<T>();

-    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
-      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
-        for (int channel = 0; channel < im_channels; ++channel) {
-          for (int filter_row_idx = 0; filter_row_idx < filter_height;
-               ++filter_row_idx) {
-            int im_row_offset =
-                col_row_idx * stride[0] + filter_row_idx - padding[0];
-            for (int filter_col_idx = 0; filter_col_idx < filter_width;
-                 ++filter_col_idx) {
-              int im_col_offset =
-                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+                    for (int col_row_idx = 0; col_row_idx < col_height;
+                         ++col_row_idx) {
+                        for (int col_col_idx = 0; col_col_idx < col_width;
+                             ++col_col_idx) {
+                            for (int channel = 0; channel < im_channels;
+                                 ++channel) {
+                                for (int filter_row_idx = 0;
+                                     filter_row_idx < filter_height;
+                                     ++filter_row_idx) {
+                                    int im_row_offset =
+                                        col_row_idx * stride[0] +
+                                        filter_row_idx - padding[0];
+                                    for (int filter_col_idx = 0;
+                                         filter_col_idx < filter_width;
+                                         ++filter_col_idx) {
+                                        int im_col_offset =
+                                            col_col_idx * stride[1] +
+                                            filter_col_idx - padding[1];

-              int col_offset =
-                  ((((col_row_idx)*col_width + col_col_idx) * im_channels +
-                    channel) *
-                       filter_height +
-                   filter_row_idx) *
-                      filter_width +
-                  filter_col_idx;
+                                        int col_offset =
+                                            ((((col_row_idx)*col_width +
+                                               col_col_idx) *
+                                                  im_channels +
+                                              channel) *
+                                                 filter_height +
+                                             filter_row_idx) *
+                                                filter_width +
+                                            filter_col_idx;

-              int im_offset = (channel * im_height + im_row_offset) * im_width +
-                              im_col_offset;
-              col_data[col_offset] =
-                  (im_row_offset < 0 || im_row_offset >= im_height ||
-                   im_col_offset < 0 || im_col_offset >= im_width)
-                      ? static_cast<T>(0)
-                      : im_data[im_offset];
-            }
-          }
-        }
-      }
-    }
-  }
-};
+                                        int im_offset = (channel * im_height +
+                                                         im_row_offset) *
+                                                            im_width +
+                                                        im_col_offset;
+                                        col_data[col_offset] =
+                                            (im_row_offset < 0 ||
+                                             im_row_offset >= im_height ||
+                                             im_col_offset < 0 ||
+                                             im_col_offset >= im_width)
+                                                ? static_cast<T>(0)
+                                                : im_data[im_offset];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            };

-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [output_height, output_width, input_channels, filter_height, filter_width]
- */
-template <class T> class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
-public:
-  void operator()(const framework::Tensor &col,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding, framework::Tensor *im) {
-    //    PADDLE_ENFORCE(im->dims().size() == 3);
-    //    PADDLE_ENFORCE(col.dims().size() == 5);
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[3];
-    int filter_width = col.dims()[4];
-    int col_height = col.dims()[0];
-    int col_width = col.dims()[1];
+            /*
+             * im = [input_channels, input_height, input_width]
+             * col =
+             *   [output_height, output_width, input_channels, filter_height,
+             * filter_width]
+             */
+            template <class T> class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
+              public:
+                void operator()(const framework::Tensor &col,
+                                const std::vector<int> &dilation,
+                                const std::vector<int> &stride,
+                                const std::vector<int> &padding,
+                                framework::Tensor *im) {
+                    //    PADDLE_ENFORCE(im->dims().size() == 3);
+                    //    PADDLE_ENFORCE(col.dims().size() == 5);
+                    int im_channels = im->dims()[0];
+                    int im_height = im->dims()[1];
+                    int im_width = im->dims()[2];
+                    int filter_height = col.dims()[3];
+                    int filter_width = col.dims()[4];
+                    int col_height = col.dims()[0];
+                    int col_width = col.dims()[1];

-    //    PADDLE_ENFORCE_EQ(
-    //        (im_height + padding[0] + padding[2] - filter_height) / stride[0]
-    //        + 1, col_height, "Output_height and padding(padding_up,
-    //        padding_down) are " "inconsistent.");
-    //    PADDLE_ENFORCE_EQ(
-    //        (im_width + padding[1] + padding[3] - filter_width) / stride[1] +
-    //        1, col_width, "col_width and padding(padding_left, padding_right)
-    //        are " "inconsistent.");
+                    //    PADDLE_ENFORCE_EQ(
+                    //        (im_height + padding[0] + padding[2] -
+                    //         filter_height) / stride[0] + 1,
+                    //        col_height,
+                    //        "Output_height and padding(padding_up, "
+                    //        "padding_down) are inconsistent.");
+                    //    PADDLE_ENFORCE_EQ(
+                    //        (im_width + padding[1] + padding[3] -
+                    //         filter_width) / stride[1] + 1,
+                    //        col_width,
+                    //        "col_width and padding(padding_left, "
+                    //        "padding_right) are inconsistent.");

-    T *im_data = im->data<T>();
-    const T *col_data = col.data<T>();
+                    T *im_data = im->data<T>();
+                    const T *col_data = col.data<T>();

-    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
-      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
-        for (int channel = 0; channel < im_channels; ++channel) {
-          for (int filter_row_idx = 0; filter_row_idx < filter_height;
-               ++filter_row_idx) {
-            int im_row_offset =
-                col_row_idx * stride[0] + filter_row_idx - padding[0];
-            for (int filter_col_idx = 0; filter_col_idx < filter_width;
-                 ++filter_col_idx) {
-              int im_col_offset =
-                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+                    for (int col_row_idx = 0; col_row_idx < col_height;
+                         ++col_row_idx) {
+                        for (int col_col_idx = 0; col_col_idx < col_width;
+                             ++col_col_idx) {
+                            for (int channel = 0; channel < im_channels;
+                                 ++channel) {
+                                for (int filter_row_idx = 0;
+                                     filter_row_idx < filter_height;
+                                     ++filter_row_idx) {
+                                    int im_row_offset =
+                                        col_row_idx * stride[0] +
+                                        filter_row_idx - padding[0];
+                                    for (int filter_col_idx = 0;
+                                         filter_col_idx < filter_width;
+                                         ++filter_col_idx) {
+                                        int im_col_offset =
+                                            col_col_idx * stride[1] +
+                                            filter_col_idx - padding[1];

-              int col_offset =
-                  (((col_row_idx * col_width + col_col_idx) * im_channels +
-                    channel) *
-                       filter_height +
-                   filter_row_idx) *
-                      filter_width +
-                  filter_col_idx;
+                                        int col_offset =
+                                            (((col_row_idx * col_width +
+                                               col_col_idx) *
+                                                  im_channels +
+                                              channel) *
+                                                 filter_height +
+                                             filter_row_idx) *
+                                                filter_width +
+                                            filter_col_idx;

-              if (im_row_offset >= 0 && im_row_offset < im_height &&
-                  im_col_offset >= 0 && im_col_offset < im_width) {
-                int im_offset =
-                    (channel * im_height + im_row_offset) * im_width +
-                    im_col_offset;
-                im_data[im_offset] += col_data[col_offset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
+                                        if (im_row_offset >= 0 &&
+                                            im_row_offset < im_height &&
+                                            im_col_offset >= 0 &&
+                                            im_col_offset < im_width) {
+                                            int im_offset =
+                                                (channel * im_height +
+                                                 im_row_offset) *
+                                                    im_width +
+                                                im_col_offset;
+                                            im_data[im_offset] +=
+                                                col_data[col_offset];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            };

-template class Im2ColFunctor<ColFormat::kOCF, CPU, float>;
-template class Im2ColFunctor<ColFormat::kOCF, CPU, double>;
-template class Col2ImFunctor<ColFormat::kOCF, CPU, float>;
-template class Col2ImFunctor<ColFormat::kOCF, CPU, double>;
+            template class Im2ColFunctor<ColFormat::kOCF, CPU, float>;
+            template class Im2ColFunctor<ColFormat::kOCF, CPU, double>;
+            template class Col2ImFunctor<ColFormat::kOCF, CPU, float>;
+            template class Col2ImFunctor<ColFormat::kOCF, CPU, double>;

-} // namespace math
-} // namespace operators
+        } // namespace math
+    }     // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/im2col.h
+++ b/src/operators/math/im2col.h
@@ -17,83 +17,96 @@ limitations under the License. */
 #include "framework/tensor.h"

 namespace paddle_mobile {
-namespace operators {
-namespace math {
+    namespace operators {
+        namespace math {

-/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
-enum class ColFormat { kCFO = 0, kOCF = 1 };
+            /* The storage format of the coldata in the Im2ColFunctor and
+             * Col2ImFunctor. */
+            enum class ColFormat { kCFO = 0, kOCF = 1 };

-/*
- * \brief Converts the image data of three dimensions(CHW) into a colData of
- *        five dimensions in the Im2ColFunctor calculation,
- *        And in the Col2ImFunctor calculation, it is reversed.
- *
- * \param imData   Image data.
- * \param imShape  The shape of imData,
- *                 [input_channels, input_height, input_width].
- * \param colData  Column data.
- * \param colShape The shape of colData.
- *
- * \param dilations    dilation data.
- * \param 2-dimension  [dilation_height, dilation_width].
- *
- * \param strides      stride data.
- * \param 2-dimension  [stride_height, stride_width].
- *
- * \param paddings     padding data.
- * \param 4-dimension  [up_pad, left_pad, down_pad, right_pad].
- *
- * If the template argument Format is kCFO, the shape of colData is:
- * [input_channels, filter_height, filter_width, output_height, output_width]
- * So, it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of convolution matrix is [height, width], where the height is equal
- * input_channels * filter_height * filter_width, and the width is equal
- * output_height * output_width.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [input_channels,
- *      filter_height,
- *      filter_width,      ======>      [height, width]
- *      output_height,
- *      output_width]
- *
- * If the template argument Format is kOCF, the shape of colData is:
- * [output_height, output_width, input_channels, filter_height, filter_width]
- * So, it is easy to reshape into a sequence matrix for rnn calculation.
- * The shape of sequence matrix is [seq_length, step_size], where the seq_length
- * is equal output_height * output_width, and the step_size is equal
- * input_channels * filter_height * filter_width.
- *
- * Reshape:
- *     shape of colData             shape of sequence matrix
- *     [output_height,
- *      output_width,
- *      input_channels,    ======>    [seqLength, stepSize]
- *      filter_height,
- *      filter_width]
- *
- * \note The caller needs to ensure that imShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-template <ColFormat Format, typename DeviceType, typename T>
-class Im2ColFunctor {
-public:
-  void operator()(const framework::Tensor &im, const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding, framework::Tensor *col);
-};
+            /*
+             * \brief Im2ColFunctor converts three-dimensional (CHW) image
+             *        data into five-dimensional column data (colData);
+             *        Col2ImFunctor performs the reverse conversion, from
+             *        colData back to image data.
+             *
+             * \param im       Image data tensor (input of Im2ColFunctor, output
+             *                 of Col2ImFunctor) with shape
+             *                 [input_channels, input_height, input_width].
+             * \param col      Column data tensor (output of Im2ColFunctor, input
+             *                 of Col2ImFunctor); its shape depends on Format.
+             *
+             * \param dilation     dilation data, 2-dimension:
+             *                     [dilation_height, dilation_width].
+             *
+             * \param stride       stride data, 2-dimension:
+             *                     [stride_height, stride_width].
+             *
+             * \param padding      padding data, 4-dimension:
+             *                     [up_pad, left_pad, down_pad, right_pad].
+             *
+             * If the template argument Format is kCFO, the shape of colData is:
+             *   [input_channels, filter_height, filter_width, output_height,
+             *    output_width]
+             * This makes it easy to reshape colData into a convolution matrix
+             * for a convolution calculation based on matrix multiplication.
+             *
+             * The shape of the convolution matrix is [height, width], where
+             * the height is equal to
+             *     input_channels * filter_height * filter_width,
+             * and the width is equal to
+             *     output_height * output_width.
+             *
+             * Reshape:
+             *     shape of colData           shape of convolution matrix
+             *     [input_channels,
+             *      filter_height,
+             *      filter_width,      ======>      [height, width]
+             *      output_height,
+             *      output_width]
+             *
+             * If the template argument Format is kOCF, the shape of colData is:
+             *   [output_height, output_width, input_channels, filter_height,
+             *    filter_width]
+             * This makes it easy to reshape colData into a sequence matrix
+             * for an RNN calculation.
+             * The shape of the sequence matrix is [seq_length, step_size],
+             * where seq_length is equal to output_height * output_width,
+             * and step_size is equal to
+             *     input_channels * filter_height * filter_width.
+             *
+             * Reshape:
+             *     shape of colData             shape of sequence matrix
+             *     [output_height,
+             *      output_width,
+             *      input_channels,    ======>    [seqLength, stepSize]
+             *      filter_height,
+             *      filter_width]
+             *
+             * \note The caller needs to ensure that the input_channels
+             *       dimension of the image data is equal to the input_channels
+             *       dimension of colData.
+             */
+            template <ColFormat Format, typename DeviceType, typename T>
+            class Im2ColFunctor {
+              public:
+                void operator()(const framework::Tensor &im,
+                                const std::vector<int> &dilation,
+                                const std::vector<int> &stride,
+                                const std::vector<int> &padding,
+                                framework::Tensor *col);
+            };

-template <ColFormat Format, typename DeviceType, typename T>
-class Col2ImFunctor {
-public:
-  void operator()(const framework::Tensor &col,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding, framework::Tensor *im);
-};
+            template <ColFormat Format, typename DeviceType, typename T>
+            class Col2ImFunctor {
+              public:
+                void operator()(const framework::Tensor &col,
+                                const std::vector<int> &dilation,
+                                const std::vector<int> &stride,
+                                const std::vector<int> &padding,
+                                framework::Tensor *im);
+            };

-} // namespace math
-} // namespace operators
+        } // namespace math
+    }     // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/math_function.cc
+++ b/src/operators/math/math_function.cc
@@ -15,106 +15,125 @@ limitations under the License. */
 #include "math_function.h"

 namespace paddle_mobile {
-namespace operators {
-namespace math {
-
-template <>
-void gemm<float>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                 const int M, const int N, const int K, const float alpha,
-                 const float *A, const float *B, const float beta, float *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-
-template <>
-void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                  const int M, const int N, const int K, const double alpha,
-                  const double *A, const double *B, const double beta,
-                  double *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-
-template <>
-void gemm<float>(const bool transA, const bool transB, const int M, const int N,
-                 const int K, const float alpha, const float *A, const int lda,
-                 const float *B, const int ldb, const float beta, float *C,
-                 const int ldc) {
-  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-
-template <>
-void gemm<double>(const bool transA, const bool transB, const int M,
-                  const int N, const int K, const double alpha, const double *A,
-                  const int lda, const double *B, const int ldb,
-                  const double beta, double *C, const int ldc) {
-  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-
-template <>
-void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
-                   const framework::Tensor &matrix_b, bool trans_b, float alpha,
-                   framework::Tensor *matrix_out, float beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //                     platform::is_cpu_place(matrix_b.place()) &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<float>(transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-              matrix_b.data<float>(), beta, matrix_out->data<float>());
-}
-
-template <>
-void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
-                    const framework::Tensor &matrix_b, bool trans_b,
-                    double alpha, framework::Tensor *matrix_out, double beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //                     platform::is_cpu_place(matrix_b.place()) &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<double>(transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-               matrix_b.data<double>(), beta, matrix_out->data<double>());
-}
-
-} // namespace math
-} // namespace operators
+    namespace operators {
+        namespace math {
+
+            template <>
+            void gemm<float>(const CBLAS_TRANSPOSE transA,
+                             const CBLAS_TRANSPOSE transB, const int M,
+                             const int N, const int K, const float alpha,
+                             const float *A, const float *B, const float beta,
+                             float *C) {
+                int lda = (transA == CblasNoTrans) ? K : M;
+                int ldb = (transB == CblasNoTrans) ? N : K;
+                int ldc = N;
+                cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A,
+                            lda, B, ldb, beta, C, ldc);
+            }
+
+            template <>
+            void gemm<double>(const CBLAS_TRANSPOSE transA,
+                              const CBLAS_TRANSPOSE transB, const int M,
+                              const int N, const int K, const double alpha,
+                              const double *A, const double *B,
+                              const double beta, double *C) {
+                int lda = (transA == CblasNoTrans) ? K : M;
+                int ldb = (transB == CblasNoTrans) ? N : K;
+                int ldc = N;
+                cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A,
+                            lda, B, ldb, beta, C, ldc);
+            }
+
+            template <>
+            void gemm<float>(const bool transA, const bool transB, const int M,
+                             const int N, const int K, const float alpha,
+                             const float *A, const int lda, const float *B,
+                             const int ldb, const float beta, float *C,
+                             const int ldc) {
+                cblas_sgemm(CblasRowMajor,
+                            transA == false ? CblasNoTrans : CblasTrans,
+                            transB == false ? CblasNoTrans : CblasTrans, M, N,
+                            K, alpha, A, lda, B, ldb, beta, C, ldc);
+            }
+
+            template <>
+            void gemm<double>(const bool transA, const bool transB, const int M,
+                              const int N, const int K, const double alpha,
+                              const double *A, const int lda, const double *B,
+                              const int ldb, const double beta, double *C,
+                              const int ldc) {
+                cblas_dgemm(CblasRowMajor,
+                            transA == false ? CblasNoTrans : CblasTrans,
+                            transB == false ? CblasNoTrans : CblasTrans, M, N,
+                            K, alpha, A, lda, B, ldb, beta, C, ldc);
+            }
+
+            template <>
+            void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
+                               const framework::Tensor &matrix_b, bool trans_b,
+                               float alpha, framework::Tensor *matrix_out,
+                               float beta) {
+                auto dim_a = matrix_a.dims();
+                auto dim_b = matrix_b.dims();
+                auto dim_out = matrix_out->dims();
+                // NOTE(review): the checks below are commented out, so dim_b
+                // is unused and the tensor ranks are not validated here.
+                //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
+                //                     dim_out.size() == 2,
+                //                 "The input and output of matmul be matrix");
+                //
+                //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
+                //                     platform::is_cpu_place(matrix_b.place()) &&
+                //                     platform::is_cpu_place(matrix_out->place()),
+                //                 "Matrix must all be in CPUPlace");
+
+                int M = dim_out[0];
+                int N = dim_out[1];
+                int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+                CBLAS_TRANSPOSE transA =
+                    (trans_a == false) ? CblasNoTrans : CblasTrans;
+                CBLAS_TRANSPOSE transB =
+                    (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+                gemm<float>(transA, transB, M, N, K, alpha,
+                            matrix_a.data<float>(), matrix_b.data<float>(),
+                            beta, matrix_out->data<float>());
+            }
+
+            template <>
+            void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
+                                const framework::Tensor &matrix_b, bool trans_b,
+                                double alpha, framework::Tensor *matrix_out,
+                                double beta) {
+                auto dim_a = matrix_a.dims();
+                auto dim_b = matrix_b.dims();
+                auto dim_out = matrix_out->dims();
+                // NOTE(review): the checks below are commented out, so dim_b
+                // is unused and the tensor ranks are not validated here.
+                //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
+                //                     dim_out.size() == 2,
+                //                 "The input and output of matmul be matrix");
+                //
+                //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
+                //                     platform::is_cpu_place(matrix_b.place()) &&
+                //                     platform::is_cpu_place(matrix_out->place()),
+                //                 "Matrix must all be in CPUPlace");
+
+                int M = dim_out[0];
+                int N = dim_out[1];
+                int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+                CBLAS_TRANSPOSE transA =
+                    (trans_a == false) ? CblasNoTrans : CblasTrans;
+                CBLAS_TRANSPOSE transB =
+                    (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+                gemm<double>(transA, transB, M, N, K, alpha,
+                             matrix_a.data<double>(), matrix_b.data<double>(),
+                             beta, matrix_out->data<double>());
+            }
+
+        } // namespace math
+    }     // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -19,24 +19,26 @@ limitations under the License. */
 #include <cmath>

 namespace paddle_mobile {
-namespace operators {
-namespace math {
-
-template <typename T>
-void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-          const int M, const int N, const int K, const T alpha, const T *A,
-          const T *B, const T beta, T *C);
-
-template <typename T>
-void gemm(const bool transA, const bool transB, const int M, const int N,
-          const int K, const T alpha, const T *A, const int lda, const T *B,
-          const int ldb, const T beta, T *C, const int ldc);
-
-// matrix multiply with continuous memory
-template <typename T>
-void matmul(const framework::Tensor &matrix_a, bool trans_a,
-            const framework::Tensor &matrix_b, bool trans_b, T alpha,
-            framework::Tensor *matrix_out, T beta);
-} // namespace math
-} // namespace operators
+    namespace operators {
+        namespace math {
+
+            template <typename T>
+            void gemm(const CBLAS_TRANSPOSE transA,
+                      const CBLAS_TRANSPOSE transB, const int M, const int N,
+                      const int K, const T alpha, const T *A, const T *B,
+                      const T beta, T *C);
+
+            template <typename T>
+            void gemm(const bool transA, const bool transB, const int M,
+                      const int N, const int K, const T alpha, const T *A,
+                      const int lda, const T *B, const int ldb, const T beta,
+                      T *C, const int ldc);
+
+            // matrix multiply with continuous memory
+            template <typename T>
+            void matmul(const framework::Tensor &matrix_a, bool trans_a,
+                        const framework::Tensor &matrix_b, bool trans_b,
+                        T alpha, framework::Tensor *matrix_out, T beta);
+        } // namespace math
+    }     // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/transform.h
+++ b/src/operators/math/transform.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+
+namespace paddle_mobile {
+    namespace operators {
+        namespace math {
+
+            // Transform applies a unary or a binary functor to each element in
+            // a range defined by a pair of iterators.
+            //
+            // - The unary overload calls the four-argument std::transform.
+            // - The binary overload calls the five-argument std::transform.
+            // No CUDA/thrust variant is defined in this file.
+            //
+            // NOTE: InputIter and OutputIter are declared as different
+            //       types because InputIter points to an op's inputs and
+            //       OutputIter points to an op's outputs.
+            //
+            // NOTE: InputIter is not assumed to be const InputType* and
+            //       OutputIter is not assumed to be OutputType*, because an
+            //       iterator class might be used instead, such as
+            //       paddle::fluid::operators::RowwiseTransformIterator.
+
+            struct Transform {
+                template <typename InputIter, typename OutputIter,
+                          typename UnaryOperation>
+                void operator()(InputIter first, InputIter last,
+                                OutputIter result, UnaryOperation op) {
+                    std::transform(first, last, result, op);
+                }
+
+                template <typename InputIter1, typename InputIter2,
+                          typename OutputIter, typename BinaryOperation>
+                void operator()(InputIter1 first1, InputIter1 last1,
+                                InputIter2 first2, OutputIter result,
+                                BinaryOperation op) {
+                    std::transform(first1, last1, first2, result, op);
+                }
+            };
+        } // namespace math
+    }     // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/math/vol2col.cc
+++ b/src/operators/math/vol2col.cc
@@ -15,179 +15,212 @@ limitations under the License. */
 #include "vol2col.h"

 namespace paddle_mobile {
-namespace operators {
-namespace math {
-
-using Tensor = paddle_mobile::framework::Tensor;
-/*
- * vol = [input_channels, input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *                    output_depth, output_height, output_width]
- */
-template <typename T> class Vol2ColFunctor<CPU, T> {
-public:
-  void operator()(const Tensor &vol, const std::vector<int> &dilations,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings, Tensor *col) const {
-    //    PADDLE_ENFORCE(vol.dims().size() == 4);
-    //    PADDLE_ENFORCE(col->dims().size() == 7);
-
-    int input_channels = vol.dims()[0];
-    int input_depth = vol.dims()[1];
-    int input_height = vol.dims()[2];
-    int input_width = vol.dims()[3];
-    int filter_depth = col->dims()[1];
-    int filter_height = col->dims()[2];
-    int filter_width = col->dims()[3];
-    int output_depth = col->dims()[4];
-    int output_height = col->dims()[5];
-    int output_width = col->dims()[6];
-    int channels_col =
-        input_channels * filter_depth * filter_height * filter_width;
-
-    //    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-    //                       ((dilations[0] * (filter_depth - 1) + 1))) /
-    //                              strides[0] +
-    //                          1,
-    //                      output_depth,
-    //                      "input_depth and output_depth are "
-    //                      "mismatching.");
-    //    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-    //                       ((dilations[1] * (filter_height - 1) + 1))) /
-    //                              strides[1] +
-    //                          1,
-    //                      output_height,
-    //                      "input_height and output_height are "
-    //                      "mismatching.");
-    //    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-    //                       ((dilations[2] * (filter_width - 1) + 1))) /
-    //                              strides[2] +
-    //                          1,
-    //                      output_width,
-    //                      "input_width and output_width are "
-    //                      "mismatching.");
-
-    const T *vol_data = vol.data<T>();
-    T *col_data = col->data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset = (c / filter_width / filter_height) % filter_depth;
-      int c_in = c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
-
-            int col_idx =
-                ((c * output_depth + d) * output_height + h) * output_width + w;
-            int vol_idx =
-                ((c_in * input_depth + d_pad) * input_height + h_pad) *
-                    input_width +
-                w_pad;
-            col_data[col_idx] =
-                (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
-                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
-                    ? static_cast<T>(0)
-                    : vol_data[vol_idx];
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * vol = [input_channels,input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *                    output_depth, output_height, output_width]
- */
-template <typename T> class Col2VolFunctor<CPU, T> {
-public:
-  void operator()(const Tensor &col, const std::vector<int> &dilations,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings, Tensor *vol) const {
-    //    PADDLE_ENFORCE(vol->dims().size() == 4);
-    //    PADDLE_ENFORCE(col.dims().size() == 7);
-
-    int input_channels = vol->dims()[0];
-    int input_depth = vol->dims()[1];
-    int input_height = vol->dims()[2];
-    int input_width = vol->dims()[3];
-    int filter_depth = col.dims()[1];
-    int filter_height = col.dims()[2];
-    int filter_width = col.dims()[3];
-    int output_depth = col.dims()[4];
-    int output_height = col.dims()[5];
-    int output_width = col.dims()[6];
-    int channels_col =
-        input_channels * filter_depth * filter_height * filter_width;
-
-    //    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-    //                       ((dilations[0] * (filter_depth - 1) + 1))) /
-    //                              strides[0] +
-    //                          1,
-    //                      output_depth,
-    //                      "input_depth and output_depth are "
-    //                      "mismatching.");
-    //    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-    //                       ((dilations[1] * (filter_height - 1) + 1))) /
-    //                              strides[1] +
-    //                          1,
-    //                      output_height,
-    //                      "input_height and output_height are "
-    //                      "mismatching.");
-    //    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-    //                       ((dilations[2] * (filter_width - 1) + 1))) /
-    //                              strides[2] +
-    //                          1,
-    //                      output_width,
-    //                      "input_width and output_width are "
-    //                      "mismatching.");
-    T *vol_data = vol->data<T>();
-    const T *col_data = col.data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset = (c / filter_width / filter_height) % filter_depth;
-      int cIm = c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
-
-            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
-                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
-              int vol_idx =
-                  ((cIm * input_depth + d_pad) * input_height + h_pad) *
-                      input_width +
-                  w_pad;
-
-              int col_idx =
-                  ((c * output_depth + d) * output_height + h) * output_width +
-                  w;
-              vol_data[vol_idx] += col_data[col_idx];
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Vol2ColFunctor<CPU, float>;
-template class Vol2ColFunctor<CPU, double>;
-template class Col2VolFunctor<CPU, float>;
-template class Col2VolFunctor<CPU, double>;
-
-} // namespace math
-} // namespace operators
+    namespace operators {
+        namespace math {
+
+            using Tensor = paddle_mobile::framework::Tensor;
+            /*
+             * vol = [input_channels, input_depth, input_height, input_width]
+             * col =
+             *   [input_channels, filter_depth, filter_height, filter_width,
+             *                    output_depth, output_height, output_width]
+             */
+            template <typename T> class Vol2ColFunctor<CPU, T> {
+              public:
+                void operator()(const Tensor &vol,
+                                const std::vector<int> &dilations,
+                                const std::vector<int> &strides,
+                                const std::vector<int> &paddings,
+                                Tensor *col) const {
+                    //    PADDLE_ENFORCE(vol.dims().size() == 4);
+                    //    PADDLE_ENFORCE(col->dims().size() == 7);
+
+                    int input_channels = vol.dims()[0];
+                    int input_depth = vol.dims()[1];
+                    int input_height = vol.dims()[2];
+                    int input_width = vol.dims()[3];
+                    int filter_depth = col->dims()[1];
+                    int filter_height = col->dims()[2];
+                    int filter_width = col->dims()[3];
+                    int output_depth = col->dims()[4];
+                    int output_height = col->dims()[5];
+                    int output_width = col->dims()[6];
+                    int channels_col = input_channels * filter_depth *
+                                       filter_height * filter_width;
+
+                    //    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                    //                       ((dilations[0] * (filter_depth - 1)
+                    //                       + 1))) /
+                    //                              strides[0] +
+                    //                          1,
+                    //                      output_depth,
+                    //                      "input_depth and output_depth are "
+                    //                      "mismatching.");
+                    //    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                    //                       ((dilations[1] * (filter_height -
+                    //                       1) + 1))) /
+                    //                              strides[1] +
+                    //                          1,
+                    //                      output_height,
+                    //                      "input_height and output_height "
+                    //                      "are "
+                    //                      "mismatching.");
+                    //    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                    //                       ((dilations[2] * (filter_width - 1)
+                    //                       + 1))) /
+                    //                              strides[2] +
+                    //                          1,
+                    //                      output_width,
+                    //                      "input_width and output_width are "
+                    //                      "mismatching.");
+
+                    const T *vol_data = vol.data<T>();
+                    T *col_data = col->data<T>();
+
+                    for (int c = 0; c < channels_col; ++c) {
+                        int w_offset = c % filter_width;
+                        int h_offset = (c / filter_width) % filter_height;
+                        int d_offset =
+                            (c / filter_width / filter_height) % filter_depth;
+                        int c_in =
+                            c / filter_width / filter_height / filter_depth;
+                        for (int d = 0; d < output_depth; ++d) {
+                            int d_pad = d * strides[0] - paddings[0] +
+                                        d_offset * dilations[0];
+                            for (int h = 0; h < output_height; ++h) {
+                                int h_pad = h * strides[1] - paddings[1] +
+                                            h_offset * dilations[1];
+                                for (int w = 0; w < output_width; ++w) {
+                                    int w_pad = w * strides[2] - paddings[2] +
+                                                w_offset * dilations[2];
+
+                                    int col_idx = ((c * output_depth + d) *
+                                                       output_height +
+                                                   h) *
+                                                      output_width +
+                                                  w;
+                                    int vol_idx =
+                                        ((c_in * input_depth + d_pad) *
+                                             input_height +
+                                         h_pad) *
+                                            input_width +
+                                        w_pad;
+                                    col_data[col_idx] =
+                                        (h_pad < 0 || h_pad >= input_height ||
+                                         w_pad < 0 || w_pad >= input_width ||
+                                         d_pad < 0 || d_pad >= input_depth)
+                                            ? static_cast<T>(0)
+                                            : vol_data[vol_idx];
+                                }
+                            }
+                        }
+                    }
+                }
+            };
+
+            /*
+             * vol = [input_channels,input_depth, input_height, input_width]
+             * col =
+             *   [input_channels, filter_depth, filter_height, filter_width,
+             *                    output_depth, output_height, output_width]
+             */
+            template <typename T> class Col2VolFunctor<CPU, T> {
+              public:
+                void operator()(const Tensor &col,
+                                const std::vector<int> &dilations,
+                                const std::vector<int> &strides,
+                                const std::vector<int> &paddings,
+                                Tensor *vol) const {
+                    //    PADDLE_ENFORCE(vol->dims().size() == 4);
+                    //    PADDLE_ENFORCE(col.dims().size() == 7);
+
+                    int input_channels = vol->dims()[0];
+                    int input_depth = vol->dims()[1];
+                    int input_height = vol->dims()[2];
+                    int input_width = vol->dims()[3];
+                    int filter_depth = col.dims()[1];
+                    int filter_height = col.dims()[2];
+                    int filter_width = col.dims()[3];
+                    int output_depth = col.dims()[4];
+                    int output_height = col.dims()[5];
+                    int output_width = col.dims()[6];
+                    int channels_col = input_channels * filter_depth *
+                                       filter_height * filter_width;
+
+                    //    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                    //                       ((dilations[0] * (filter_depth - 1)
+                    //                       + 1))) /
+                    //                              strides[0] +
+                    //                          1,
+                    //                      output_depth,
+                    //                      "input_depth and output_depth are "
+                    //                      "mismatching.");
+                    //    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                    //                       ((dilations[1] * (filter_height -
+                    //                       1) + 1))) /
+                    //                              strides[1] +
+                    //                          1,
+                    //                      output_height,
+                    //                      "input_height and output_height "
+                    //                      "are "
+                    //                      "mismatching.");
+                    //    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                    //                       ((dilations[2] * (filter_width - 1)
+                    //                       + 1))) /
+                    //                              strides[2] +
+                    //                          1,
+                    //                      output_width,
+                    //                      "input_width and output_width are "
+                    //                      "mismatching.");
+                    T *vol_data = vol->data<T>();
+                    const T *col_data = col.data<T>();
+
+                    for (int c = 0; c < channels_col; ++c) {
+                        int w_offset = c % filter_width;
+                        int h_offset = (c / filter_width) % filter_height;
+                        int d_offset =
+                            (c / filter_width / filter_height) % filter_depth;
+                        int cIm =
+                            c / filter_width / filter_height / filter_depth;
+                        for (int d = 0; d < output_depth; ++d) {
+                            int d_pad = d * strides[0] - paddings[0] +
+                                        d_offset * dilations[0];
+                            for (int h = 0; h < output_height; ++h) {
+                                int h_pad = h * strides[1] - paddings[1] +
+                                            h_offset * dilations[1];
+                                for (int w = 0; w < output_width; ++w) {
+                                    int w_pad = w * strides[2] - paddings[2] +
+                                                w_offset * dilations[2];
+
+                                    if (h_pad >= 0 && h_pad < input_height &&
+                                        w_pad >= 0 && w_pad < input_width &&
+                                        d_pad >= 0 && d_pad < input_depth) {
+                                        int vol_idx =
+                                            ((cIm * input_depth + d_pad) *
+                                                 input_height +
+                                             h_pad) *
+                                                input_width +
+                                            w_pad;
+
+                                        int col_idx = ((c * output_depth + d) *
+                                                           output_height +
+                                                       h) *
+                                                          output_width +
+                                                      w;
+                                        vol_data[vol_idx] += col_data[col_idx];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            };
+
+            template class Vol2ColFunctor<CPU, float>;
+            template class Vol2ColFunctor<CPU, double>;
+            template class Col2VolFunctor<CPU, float>;
+            template class Col2VolFunctor<CPU, double>;
+
+        } // namespace math
+    }     // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/vol2col.h
+++ b/src/operators/math/vol2col.h
@@ -18,66 +18,78 @@ limitations under the License. */
 #include "framework/tensor.h"

 namespace paddle_mobile {
-namespace operators {
-namespace math {
-/*
- * \brief Converts the feature data of four dimensions(CDHW) into a colData of
- *        seven dimensions in the Vol2ColFunctor calculation,
- *        And in the Col2VolFunctor calculation, it is reversed.
- *
- * \param volData   Vol data.
- * \param volShape  The shape of volData,
- *                 [input_channels, input_depth, input_height, input_width].
- * \param colData  Column data.
- * \param colShape The shape of colData.
- *
- * \param dilations    dilation data.
- * \param 3-dimension  [dilation_depth, dilation_height, dilation_width].
- *
- * \param strides      stride data.
- * \param 3-dimension  [stride_depth, stride_height, stride_width].
- *
- * \param paddings     padding data.
- * \param 3-dimension  [d_pad, h_pad, w_pad].
- *
- * The shape of colData is:
- * [input_channels, filter_depth, filter_height, filter_width, output_depth,
- * output_height, output_width]
- * So, it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of convolution matrix is [height, width], where the height is equal
- * input_channels * filter_depth * filter_height * filter_width, and the width
- * is equal output_depth * output_height * output_width.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [input_channels,
- *      filter_depth,
- *      filter_height,
- *      filter_width,      ======>      [height, width]
- *      output_depth,
- *      output_height,
- *      output_width]
- *
- * \note The caller needs to ensure that volShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-using Tensor = paddle_mobile::framework::Tensor;
+    namespace operators {
+        namespace math {
+            /*
+             * \brief Converts the feature data of four dimensions (CDHW) into
+             *        a colData of seven dimensions in the Vol2ColFunctor
+             *        calculation. In the Col2VolFunctor calculation, the
+             *        conversion is reversed.
+             *
+             * \param volData   Vol data.
+             * \param volShape  The shape of volData,
+             *                 [input_channels, input_depth, input_height,
+             *                  input_width].
+             * \param colData  Column data.
+             * \param colShape The shape of colData.
+             *
+             * \param dilations    dilation data.
+             * \param 3-dimension  [dilation_depth, dilation_height,
+             *                      dilation_width].
+             *
+             * \param strides      stride data.
+             * \param 3-dimension  [stride_depth, stride_height, stride_width].
+             *
+             * \param paddings     padding data.
+             * \param 3-dimension  [d_pad, h_pad, w_pad].
+             *
+             * The shape of colData is:
+             *   [input_channels, filter_depth, filter_height, filter_width,
+             *    output_depth, output_height, output_width]
+             *
+             * So, it is easy to reshape colData into a convolution matrix for
+             * convolution calculation based on matrix multiplication.
+             *
+             * The shape of the convolution matrix is [height, width], where
+             * height is equal to
+             *   input_channels * filter_depth * filter_height * filter_width,
+             * and width is equal to
+             *   output_depth * output_height * output_width.
+             *
+             * Reshape:
+             *     shape of colData           shape of convolution matrix
+             *     [input_channels,
+             *      filter_depth,
+             *      filter_height,
+             *      filter_width,      ======>      [height, width]
+             *      output_depth,
+             *      output_height,
+             *      output_width]
+             *
+             * \note The caller needs to ensure that
+             *       volShape.inputChannels is equal to
+             *       colShape.inputChannels.
+             */
+            using Tensor = paddle_mobile::framework::Tensor;

-template <typename DeviceType, typename T> class Vol2ColFunctor {
-public:
-  void operator()(const Tensor &vol, const std::vector<int> &dilations,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings, Tensor *col) const;
-};
+            template <typename DeviceType, typename T> class Vol2ColFunctor {
+              public:
+                void operator()(const Tensor &vol,
+                                const std::vector<int> &dilations,
+                                const std::vector<int> &strides,
+                                const std::vector<int> &paddings,
+                                Tensor *col) const;
+            };

-template <typename DeviceType, typename T> class Col2VolFunctor {
-public:
-  void operator()(const Tensor &col, const std::vector<int> &dilations,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings, Tensor *vol) const;
-};
+            template <typename DeviceType, typename T> class Col2VolFunctor {
+              public:
+                void operator()(const Tensor &col,
+                                const std::vector<int> &dilations,
+                                const std::vector<int> &strides,
+                                const std::vector<int> &paddings,
+                                Tensor *vol) const;
+            };

-} // namespace math
-} // namespace operators
+        } // namespace math
+    }     // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/op_param.cpp
+++ b/src/operators/op_param.cpp
@@ -19,28 +19,27 @@ SOFTWARE.
 #include "op_param.h"

 namespace paddle_mobile {
-namespace operators {
-
-Print &operator<<(Print &printer, const ConvParam &conv_param) {
-  printer << "parameter of conv: "
-          << "\n";
-  printer << "  stride: "
-          << " (" << conv_param.Strides()[0] << conv_param.Strides()[1] << ") "
-          << "\n";
-  printer << "  paddings: "
-          << " (" << conv_param.Paddings()[0] << conv_param.Paddings()[1]
-          << ") "
-          << "\n";
-  printer << "  dilations: "
-          << " (" << conv_param.Dilations()[0] << conv_param.Dilations()[1]
-          << ") "
-          << "\n";
-  printer << "  groups: " << conv_param.Groups() << "\n";
-  printer << "  input  dims: " << conv_param.Input()->dims() << "\n";
-  printer << "  filter dims: " << conv_param.Filter()->dims() << "\n";
-  printer << "  output dims: " << conv_param.Output()->dims();
-  return printer;
-}
-
-} // namespace operators
+    namespace operators {
+        Print &operator<<(Print &printer, const ConvParam &conv_param) {
+            printer << "parameter of conv: "
+                    << "\n";
+            printer << "  stride: "
+                    << " (" << conv_param.Strides()[0]
+                    << conv_param.Strides()[1] << ") "
+                    << "\n";
+            printer << "  paddings: "
+                    << " (" << conv_param.Paddings()[0]
+                    << conv_param.Paddings()[1] << ") "
+                    << "\n";
+            printer << "  dilations: "
+                    << " (" << conv_param.Dilations()[0]
+                    << conv_param.Dilations()[1] << ") "
+                    << "\n";
+            printer << "  groups: " << conv_param.Groups() << "\n";
+            printer << "  input  dims: " << conv_param.Input()->dims() << "\n";
+            printer << "  filter dims: " << conv_param.Filter()->dims() << "\n";
+            printer << "  output dims: " << conv_param.Output()->dims();
+            return printer;
+        }
+    } // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -26,86 +26,211 @@ SOFTWARE.
 #include "framework/variable.h"

 namespace paddle_mobile {
-namespace operators {
-
-using namespace framework;
-
-class OpParam : PaddleMobileObject {
-public:
-protected:
-  template <typename T>
-  static T *InputFrom(const VariableNameMap &inputs, const Scope &scope) {
-    return GetVarValue<T>("Input", inputs, scope);
-  }
-
-  template <typename T>
-  static T *OutputFrom(const VariableNameMap &outputs, const Scope &scope) {
-    return GetVarValue<T>("Output", outputs, scope);
-  }
-
-  template <typename T>
-  static T *FilterFrom(const VariableNameMap &inputs, const Scope &scope) {
-    return GetVarValue<T>("Filter", inputs, scope);
-  }
-
-  template <typename T>
-  static const T GetAttr(std::string key, const AttributeMap &map) {
-    return ((Attribute)map.at(key)).Get<T>();
-  }
-
-  template <typename T>
-  static T *GetVarValue(std::string key, const VariableNameMap &var_map,
-                        const Scope &scope) {
-    auto var_vec = var_map.at(key);
-    if (var_vec.size()) {
-      //      std::cout << " get var value -- " << var_vec[0] << std::endl;
-      auto var = scope.FindVar(var_vec[0]);
-      return var->GetMutable<T>();
-    } else {
-      return nullptr;
-    }
-  }
-};
-
-class ConvParam : OpParam {
-public:
-  ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const framework::AttributeMap &attrs,
-            const framework::Scope &scope) {
-    filter_ = FilterFrom<framework::LoDTensor>(inputs, scope);
-    input_ = InputFrom<framework::Tensor>(inputs, scope);
-    output_ = OutputFrom<framework::Tensor>(outputs, scope);
-    strides_ = GetAttr<std::vector<int>>("strides", attrs);
-    paddings_ = GetAttr<std::vector<int>>("paddings", attrs);
-    dilations_ = GetAttr<std::vector<int>>("dilations", attrs);
-    groups = GetAttr<int>("groups", attrs);
-  }
-
-  const Tensor *Input() const { return input_; }
-
-  const LoDTensor *Filter() const { return filter_; }
-
-  Tensor *Output() const { return output_; }
-
-  const std::vector<int> &Strides() const { return strides_; }
-
-  const std::vector<int> &Paddings() const { return paddings_; }
-
-  const std::vector<int> &Dilations() const { return dilations_; }
-
-  const int &Groups() const { return groups; }
-
-private:
-  Tensor *input_;
-  Tensor *output_;
-  LoDTensor *filter_;
-  std::vector<int> strides_;
-  std::vector<int> paddings_;
-  std::vector<int> dilations_;
-  int groups;
-};
-
-Print &operator<<(Print &printer, const ConvParam &conv_param);
-
-} // namespace operators
+    namespace operators {
+
+        using namespace framework;
+
+        /// Common base of the per-operator parameter classes.
+        ///
+        /// Offers protected static helpers that resolve an operator's named
+        /// inputs/outputs ("Input", "X", "Y", "Output", "Out", "Filter") to
+        /// the objects stored in a Scope, and that read typed attributes from
+        /// an AttributeMap.
+        class OpParam : PaddleMobileObject {
+          protected:
+            /// Resolves the first variable registered under "Input".
+            template <typename T>
+            static T *InputFrom(const VariableNameMap &inputs,
+                                const Scope &scope) {
+                return GetVarValue<T>("Input", inputs, scope);
+            }
+
+            /// Resolves the first variable registered under "X".
+            template <typename T>
+            static T *InputXFrom(const VariableNameMap &inputs,
+                                 const Scope &scope) {
+                return GetVarValue<T>("X", inputs, scope);
+            }
+
+            /// Resolves the first variable registered under "Y".
+            template <typename T>
+            static T *InputYFrom(const VariableNameMap &inputs,
+                                 const Scope &scope) {
+                return GetVarValue<T>("Y", inputs, scope);
+            }
+
+            /// Resolves every variable registered under "Input".
+            template <typename T>
+            static std::vector<T *>
+            InputMultiFrom(const VariableNameMap &inputs, const Scope &scope) {
+                return GetMultiVarValue<T>("Input", inputs, scope);
+            }
+
+            /// Resolves the first variable registered under "Output".
+            template <typename T>
+            static T *OutputFrom(const VariableNameMap &outputs,
+                                 const Scope &scope) {
+                return GetVarValue<T>("Output", outputs, scope);
+            }
+
+            /// Resolves the first variable registered under "Out".
+            template <typename T>
+            static T *OutFrom(const VariableNameMap &outputs,
+                              const Scope &scope) {
+                return GetVarValue<T>("Out", outputs, scope);
+            }
+
+            /// Resolves the first variable registered under "Filter".
+            template <typename T>
+            static T *FilterFrom(const VariableNameMap &inputs,
+                                 const Scope &scope) {
+                return GetVarValue<T>("Filter", inputs, scope);
+            }
+
+            /// Returns the attribute stored under `key`, read as a T.
+            /// map.at() throws std::out_of_range when the key is missing.
+            template <typename T>
+            static const T GetAttr(const std::string &key,
+                                   const AttributeMap &map) {
+                // Work on a copy of the attribute, as the original did, so
+                // that Get<T>() is never invoked on the const map entry.
+                Attribute attr = map.at(key);
+                return attr.Get<T>();
+            }
+
+            /// Looks up the first variable name listed under `key` and returns
+            /// that variable's value as a mutable T.
+            ///
+            /// @return nullptr when no names are listed under `key` or when
+            ///         the scope lookup yields a null variable.
+            template <typename T>
+            static T *GetVarValue(const std::string &key,
+                                  const VariableNameMap &var_map,
+                                  const Scope &scope) {
+                // Bind by reference: at() hands back a reference, and copying
+                // the whole name list for every lookup is unnecessary.
+                const auto &var_names = var_map.at(key);
+                if (var_names.empty()) {
+                    return nullptr;
+                }
+                auto var = scope.FindVar(var_names[0]);
+                // Callers already have to cope with nullptr (empty name list),
+                // so report a failed lookup the same way instead of
+                // dereferencing a null variable.
+                if (var == nullptr) {
+                    return nullptr;
+                }
+                return var->GetMutable<T>();
+            }
+
+            /// Looks up every variable name listed under `key` and returns
+            /// the variables' values as mutable T pointers, in listing order.
+            ///
+            /// NOTE(review): the assert demands at least two names; confirm
+            /// that a single-input list is really meant to be rejected.
+            template <typename T>
+            static std::vector<T *>
+            GetMultiVarValue(const std::string &key,
+                             const VariableNameMap &var_map,
+                             const Scope &scope) {
+                const auto &var_names = var_map.at(key);
+                assert(var_names.size() > 1);
+                std::vector<T *> var_res;
+                var_res.reserve(var_names.size());
+                for (const auto &var_name : var_names) {
+                    auto var = scope.FindVar(var_name);
+                    var_res.push_back(var->GetMutable<T>());
+                }
+                return var_res;
+            }
+        };
+
+        /// Parameters of a convolution operator: the input, filter and output
+        /// tensors resolved from a scope plus the "strides", "paddings",
+        /// "dilations" and "groups" attributes.
+        class ConvParam : OpParam {
+          public:
+            /// Resolves the "Filter"/"Input"/"Output" variables in `scope`
+            /// and reads the four convolution attributes from `attrs`.
+            ConvParam(const VariableNameMap &inputs,
+                      const VariableNameMap &outputs,
+                      const AttributeMap &attrs, const Scope &scope) {
+                filter_ = FilterFrom<LoDTensor>(inputs, scope);
+                input_ = InputFrom<Tensor>(inputs, scope);
+                output_ = OutputFrom<Tensor>(outputs, scope);
+                strides_ = GetAttr<std::vector<int>>("strides", attrs);
+                paddings_ = GetAttr<std::vector<int>>("paddings", attrs);
+                dilations_ = GetAttr<std::vector<int>>("dilations", attrs);
+                groups_ = GetAttr<int>("groups", attrs);
+            }
+
+            /// Input tensor (read-only).
+            const Tensor *Input() const { return input_; }
+
+            /// Filter tensor (read-only).
+            const LoDTensor *Filter() const { return filter_; }
+
+            /// Output tensor (writable).
+            Tensor *Output() const { return output_; }
+
+            /// Value of the "strides" attribute.
+            const std::vector<int> &Strides() const { return strides_; }
+
+            /// Value of the "paddings" attribute.
+            const std::vector<int> &Paddings() const { return paddings_; }
+
+            /// Value of the "dilations" attribute.
+            const std::vector<int> &Dilations() const { return dilations_; }
+
+            /// Value of the "groups" attribute.
+            const int &Groups() const { return groups_; }
+
+          private:
+            Tensor *input_;
+            Tensor *output_;
+            LoDTensor *filter_;
+            std::vector<int> strides_;
+            std::vector<int> paddings_;
+            std::vector<int> dilations_;
+            int groups_;
+        };
+
+        Print &operator<<(Print &printer, const ConvParam &conv_param);
+
+        /// Parameters of the elementwise-add operator: the "X" and "Y" input
+        /// tensors, the "Out" tensor and the "axis" attribute.
+        class ElementwiseAddParam : OpParam {
+          public:
+            /// Resolves "X", "Y" and "Out" in `scope` and reads "axis" from
+            /// `attrs`.
+            ElementwiseAddParam(const VariableNameMap &inputs,
+                                const VariableNameMap &outputs,
+                                const AttributeMap &attrs,
+                                const Scope &scope) {
+                input_x_ = InputXFrom<Tensor>(inputs, scope);
+                input_y_ = InputYFrom<Tensor>(inputs, scope);
+                out_ = OutFrom<Tensor>(outputs, scope);
+                axis_ = GetAttr<int>("axis", attrs);
+            }
+
+            /// Tensor bound to input "X" (read-only).
+            const Tensor *InputX() const { return input_x_; }
+
+            /// Tensor bound to input "Y" (read-only).
+            const Tensor *InputY() const { return input_y_; }
+
+            /// Tensor bound to output "Out" (writable).
+            Tensor *Out() const { return out_; }
+
+            /// Value of the "axis" attribute.
+            const int &Axis() const { return axis_; }
+
+          private:
+            Tensor *input_x_;
+            Tensor *input_y_;
+            Tensor *out_;
+            int axis_;
+        };
+
+        /// Parameters of the mul operator: the "X" and "Y" input tensors, the
+        /// "Out" tensor and the "x_num_col_dims" / "y_num_col_dims"
+        /// attributes.
+        class MulParam : OpParam {
+          public:
+            /// Resolves "X", "Y" and "Out" in `scope` and reads both
+            /// column-dimension attributes from `attrs`.
+            MulParam(const VariableNameMap &inputs,
+                     const VariableNameMap &outputs,
+                     const AttributeMap &attrs, const Scope &scope) {
+                input_x_ = InputXFrom<Tensor>(inputs, scope);
+                input_y_ = InputYFrom<Tensor>(inputs, scope);
+                out_ = OutFrom<Tensor>(outputs, scope);
+                x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
+                y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
+            }
+
+            /// Tensor bound to input "X" (read-only).
+            const Tensor *InputX() const { return input_x_; }
+
+            /// Tensor bound to input "Y" (read-only).
+            const Tensor *InputY() const { return input_y_; }
+
+            /// Tensor bound to output "Out" (writable).
+            Tensor *Out() const { return out_; }
+
+            /// Value of the "x_num_col_dims" attribute.
+            const int &XNumColDims() const { return x_num_col_dims_; }
+
+            /// Value of the "y_num_col_dims" attribute.
+            const int &YNumColDims() const { return y_num_col_dims_; }
+
+          private:
+            Tensor *input_x_;
+            Tensor *input_y_;
+            Tensor *out_;
+            int x_num_col_dims_;
+            int y_num_col_dims_;
+        };
+
+        /// Parameters of the concat operator: every tensor listed under
+        /// "Input", the "Out" tensor and the "axis" attribute.
+        class ConcatParam : public OpParam {
+          public:
+            /// Resolves all "Input" variables and "Out" in `scope` and reads
+            /// "axis" from `attrs`.
+            ConcatParam(const VariableNameMap &inputs,
+                        const VariableNameMap &outputs,
+                        const AttributeMap &attrs, const Scope &scope) {
+                inputs_ = InputMultiFrom<Tensor>(inputs, scope);
+                out_ = OutFrom<Tensor>(outputs, scope);
+                axis_ = GetAttr<int>("axis", attrs);
+            }
+
+            /// Copy of the list of input tensor pointers.
+            std::vector<Tensor *> Inputs() const { return inputs_; }
+
+            /// Tensor bound to output "Out" (writable).
+            Tensor *Out() const { return out_; }
+
+            /// Value of the "axis" attribute.
+            const int &Axis() const { return axis_; }
+
+          private:
+            std::vector<Tensor *> inputs_;
+            Tensor *out_;
+            int axis_;
+        };
+
+    } // namespace operators
 } // namespace paddle_mobile
--- a/src/platform/data_type.h
+++ b/src/platform/data_type.h
@@ -19,106 +19,107 @@ limitations under the License. */
+#include <cstdlib>
 #include <typeindex>

 namespace paddle_mobile {
-namespace framework {
+    namespace framework {

-inline proto::VarType::Type ToDataType(std::type_index type) {
-  /*if (typeid(platform::float16).hash_code() == type.hash_code()) {
-    return proto::VarType::FP16;
-  } else */
-  if (typeid(const float).hash_code() == type.hash_code()) {
-    // CPPLint complains Using C-style cast.  Use static_cast<float>() instead
-    // One fix to this is to replace float with const float because
-    // typeid(T) == typeid(const T)
-    // http://en.cppreference.com/w/cpp/language/typeid
-    return proto::VarType::FP32;
-  } else if (typeid(const double).hash_code() == type.hash_code()) {
-    return proto::VarType::FP64;
-  } else if (typeid(const int).hash_code() == type.hash_code()) {
-    return proto::VarType::INT32;
-  } else if (typeid(const int64_t).hash_code() == type.hash_code()) {
-    return proto::VarType::INT64;
-  } else if (typeid(const bool).hash_code() == type.hash_code()) {
-    return proto::VarType::BOOL;
-  } else {
-    //    PADDLE_THROW("Not supported");
-    //    std::cout << "Not supported";
-  }
-}
+        /// Maps a C++ element type to its proto::VarType::Type tag.
+        ///
+        /// Handles float, double, int, int64_t and bool. Any other type is a
+        /// fatal error: a message is printed and the process aborts, because
+        /// returning without a value from this function would be undefined
+        /// behaviour (the upstream code raised PADDLE_THROW here).
+        ///
+        /// @param type type_index of the element type to translate
+        /// @return the matching proto::VarType::Type enumerator
+        inline proto::VarType::Type ToDataType(std::type_index type) {
+            /*if (type == std::type_index(typeid(platform::float16))) {
+              return proto::VarType::FP16;
+            } else */
+            // type_index values are compared directly rather than through
+            // hash_code(): equal hash codes do not guarantee equal types.
+            if (type == std::type_index(typeid(const float))) {
+                // CPPLint complains Using C-style cast.  Use
+                // static_cast<float>() instead
+                // One fix to this is to replace float with const float because
+                // typeid(T) == typeid(const T)
+                // http://en.cppreference.com/w/cpp/language/typeid
+                return proto::VarType::FP32;
+            }
+            if (type == std::type_index(typeid(const double))) {
+                return proto::VarType::FP64;
+            }
+            if (type == std::type_index(typeid(const int))) {
+                return proto::VarType::INT32;
+            }
+            if (type == std::type_index(typeid(const int64_t))) {
+                return proto::VarType::INT64;
+            }
+            if (type == std::type_index(typeid(const bool))) {
+                return proto::VarType::BOOL;
+            }
+            //    PADDLE_THROW("Not supported");
+            printf("Not supported\n");
+            std::abort();
+        }

-inline std::type_index ToTypeIndex(proto::VarType::Type type) {
-  switch (type) {
-  //    case proto::VarType::FP16:
-  //      return typeid(platform::float16);
-  case proto::VarType::FP32:
-    return typeid(float);
-  case proto::VarType::FP64:
-    return typeid(double);
-  case proto::VarType::INT32:
-    return typeid(int);
-  case proto::VarType::INT64:
-    return typeid(int64_t);
-  case proto::VarType::BOOL:
-    return typeid(bool);
-  default:
-    //      PADDLE_THROW("Not support type %d", type);
-    printf("Not support type %d", type);
-  }
-}
+        /// Maps a proto::VarType::Type tag to the type_index of the C++
+        /// element type it denotes.
+        ///
+        /// Handles FP32, FP64, INT32, INT64 and BOOL. Any other tag is a fatal
+        /// error: the message is printed and the process aborts, because
+        /// falling off the end of this function would be undefined behaviour
+        /// (the upstream code raised PADDLE_THROW here).
+        ///
+        /// @param type tag to translate
+        /// @return type_index of float, double, int, int64_t or bool
+        inline std::type_index ToTypeIndex(proto::VarType::Type type) {
+            switch (type) {
+            //    case proto::VarType::FP16:
+            //      return typeid(platform::float16);
+            case proto::VarType::FP32:
+                return typeid(float);
+            case proto::VarType::FP64:
+                return typeid(double);
+            case proto::VarType::INT32:
+                return typeid(int);
+            case proto::VarType::INT64:
+                return typeid(int64_t);
+            case proto::VarType::BOOL:
+                return typeid(bool);
+            default:
+                break;
+            }
+            //      PADDLE_THROW("Not support type %d", type);
+            printf("Not support type %d", static_cast<int>(type));
+            std::abort();
+        }

-template <typename Visitor>
-inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
-  switch (type) {
-  //    case proto::VarType::FP16:
-  //      visitor.template operator()<platform::float16>();
-  //      break;
-  case proto::VarType::FP32:
-    visitor.template operator()<float>();
-    break;
-  case proto::VarType::FP64:
-    visitor.template operator()<double>();
-    break;
-  case proto::VarType::INT32:
-    visitor.template operator()<int>();
-    break;
-  case proto::VarType::INT64:
-    visitor.template operator()<int64_t>();
-    break;
-  case proto::VarType::BOOL:
-    visitor.template operator()<bool>();
-    break;
-  default:
-    //      PADDLE_THROW("Not supported");
-    printf("Not supported");
-  }
-}
+        /// Invokes `visitor.template operator()<T>()` with T chosen from the
+        /// runtime type tag: float, double, int, int64_t or bool for FP32,
+        /// FP64, INT32, INT64 or BOOL respectively. For any other tag nothing
+        /// is invoked and "Not supported" is printed.
+        ///
+        /// @param type    runtime type tag selecting the instantiation
+        /// @param visitor callable object, taken by value
+        template <typename Visitor>
+        inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
+            switch (type) {
+            //    case proto::VarType::FP16:
+            //      visitor.template operator()<platform::float16>();
+            //      return;
+            case proto::VarType::FP32:
+                visitor.template operator()<float>();
+                return;
+            case proto::VarType::FP64:
+                visitor.template operator()<double>();
+                return;
+            case proto::VarType::INT32:
+                visitor.template operator()<int>();
+                return;
+            case proto::VarType::INT64:
+                visitor.template operator()<int64_t>();
+                return;
+            case proto::VarType::BOOL:
+                visitor.template operator()<bool>();
+                return;
+            default:
+                break;
+            }
+            //      PADDLE_THROW("Not supported");
+            printf("Not supported");
+        }

-inline std::string DataTypeToString(const proto::VarType::Type type) {
-  switch (type) {
-  case proto::VarType::FP16:
-    return "float16";
-  case proto::VarType::FP32:
-    return "float32";
-  case proto::VarType::FP64:
-    return "float64";
-  case proto::VarType::INT16:
-    return "int16";
-  case proto::VarType::INT32:
-    return "int32";
-  case proto::VarType::INT64:
-    return "int64";
-  case proto::VarType::BOOL:
-    return "bool";
-  default:
-    //      PADDLE_THROW("Not support type %d", type);
-    printf("Not support type %d", type);
-  }
-}
+        /// Returns the textual name of a proto::VarType::Type tag:
+        /// "float16", "float32", "float64", "int16", "int32", "int64" or
+        /// "bool".
+        ///
+        /// For any other tag the diagnostic is printed as before and
+        /// "unknown" is returned; previously control fell off the end of the
+        /// function without returning a std::string, which is undefined
+        /// behaviour.
+        ///
+        /// @param type tag to name
+        /// @return the name, or "unknown" for an unhandled tag
+        inline std::string DataTypeToString(const proto::VarType::Type type) {
+            switch (type) {
+            case proto::VarType::FP16:
+                return "float16";
+            case proto::VarType::FP32:
+                return "float32";
+            case proto::VarType::FP64:
+                return "float64";
+            case proto::VarType::INT16:
+                return "int16";
+            case proto::VarType::INT32:
+                return "int32";
+            case proto::VarType::INT64:
+                return "int64";
+            case proto::VarType::BOOL:
+                return "bool";
+            default:
+                break;
+            }
+            //      PADDLE_THROW("Not support type %d", type);
+            printf("Not support type %d", static_cast<int>(type));
+            return "unknown";
+        }

-inline std::ostream &operator<<(std::ostream &out,
-                                const proto::VarType::Type &type) {
-  out << DataTypeToString(type);
-  return out;
-}
+        /// Streams the textual name of `type`, as produced by
+        /// DataTypeToString, and returns the stream for chaining.
+        inline std::ostream &operator<<(std::ostream &out,
+                                        const proto::VarType::Type &type) {
+            return out << DataTypeToString(type);
+        }

-} // namespace framework
+    } // namespace framework
 } // namespace paddle_mobile
--- a/src/platform/macros.h
+++ b/src/platform/macros.h
@@ -17,9 +17,9 @@ limitations under the License. */
 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
 #define DISABLE_COPY_AND_ASSIGN(classname)                                     \
-private:                                                                       \
-  classname(const classname &) = delete;                                       \
-  classname(classname &&) = delete;                                            \
-  classname &operator=(const classname &) = delete;                            \
-  classname &operator=(classname &&) = delete
+  private:                                                                     \
+    classname(const classname &) = delete;                                     \
+    classname(classname &&) = delete;                                          \
+    classname &operator=(const classname &) = delete;                          \
+    classname &operator=(classname &&) = delete
 #endif
--- a/test/elementwise_add_op_test.h
+++ b/test/elementwise_add_op_test.h
+
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+==============================================================================*/
+#pragma once
+#include "operators/elementwise_add_op.h"
+#include "test_include.h"
+
+namespace paddle_mobile {
+    namespace framework {
+
+        /// Test harness for the elementwise_add operator.
+        ///
+        /// The constructor scans every block of the program, logs each op it
+        /// meets, and instantiates an ElementwiseAddOp for each
+        /// "elementwise_add" op whose first "X" input is
+        /// "batch_norm_2.tmp_2". predict_add() then feeds two tensors into the
+        /// scope and runs the collected ops of block 0.
+        template <typename Dtype> class TestElementwiseAddOp {
+          public:
+            /// Collects the elementwise_add ops of `p` that are to be run.
+            TestElementwiseAddOp(const Program<Dtype> p) : program_(p) {
+                if (use_optimize_) {
+                    to_predict_program_ = program_.optimizeProgram;
+                } else {
+                    to_predict_program_ = program_.originProgram;
+                }
+
+                const std::vector<std::shared_ptr<BlockDesc>> blocks =
+                    to_predict_program_->Blocks();
+                for (const std::shared_ptr<BlockDesc> &block_desc : blocks) {
+                    std::vector<std::shared_ptr<OpDesc>> ops =
+                        block_desc->Ops();
+                    for (const std::shared_ptr<OpDesc> &op : ops) {
+                        const bool is_add = op->Type() == "elementwise_add";
+
+                        // Report any elementwise_add with a non-default axis.
+                        if (is_add &&
+                            op->GetAttrMap().at("axis").Get<int>() != -1) {
+                            std::cout
+                                << "attr: axis = "
+                                << op->GetAttrMap().at("axis").Get<int>()
+                                << std::endl;
+                        }
+                        std::cout << "op:" << op->Type() << std::endl;
+
+                        // Only the add fed by batch_norm_2.tmp_2 is tested.
+                        if (!is_add ||
+                            !(op->Input("X")[0] == "batch_norm_2.tmp_2")) {
+                            continue;
+                        }
+
+                        std::cout << " elementwise_add attr size: "
+                                  << op->GetAttrMap().size() << std::endl;
+                        std::cout << " inputs size: " << op->GetInputs().size()
+                                  << std::endl;
+                        std::cout
+                            << " outputs size: " << op->GetOutputs().size()
+                            << std::endl;
+                        std::cout << " Input X is : " << op->Input("X")[0]
+                                  << std::endl;
+                        std::cout << " Input Y is : " << op->Input("Y")[0]
+                                  << std::endl;
+                        std::cout << " Output Out is : " << op->Output("Out")[0]
+                                  << std::endl;
+                        Attribute axis_attr = op->GetAttrMap().at("axis");
+                        int axis = axis_attr.Get<int>();
+                        std::cout << " Attr axis is : " << axis << std::endl;
+
+                        std::shared_ptr<
+                            operators::ElementwiseAddOp<Dtype, float>>
+                            add = std::make_shared<
+                                operators::ElementwiseAddOp<Dtype, float>>(
+                                op->Type(), op->GetInputs(), op->GetOutputs(),
+                                op->GetAttrMap(), program_.scope);
+                        ops_of_block_[*block_desc].push_back(add);
+                    }
+                }
+            }
+
+            /// Shares `t1` and `t2` into the scope variables
+            /// "batch_norm_2.tmp_2" and "batch_norm_0.tmp_3", allocates the
+            /// {1, 3, 224, 224} float output "elementwise_add_0.tmp_0", runs
+            /// the collected ops of block 0 and returns the output tensor.
+            ///
+            /// The returned pointer is a NON-owning handle to the tensor held
+            /// by the scope variable; it must not outlive the program's
+            /// scope. The previous code handed the raw pointer to an owning
+            /// shared_ptr, which would delete a tensor this function never
+            /// allocated (presumably owned by the Variable - verify against
+            /// Variable::GetMutable).
+            std::shared_ptr<Tensor> predict_add(Tensor &t1, Tensor &t2) {
+                // feed
+                auto scope = program_.scope;
+                Variable *x_feed_value = scope->Var("batch_norm_2.tmp_2");
+                auto tensor_x = x_feed_value->GetMutable<Tensor>();
+                tensor_x->ShareDataWith(t1);
+
+                Variable *y_feed_value = scope->Var("batch_norm_0.tmp_3");
+                auto tensor_y = y_feed_value->GetMutable<Tensor>();
+                tensor_y->ShareDataWith(t2);
+
+                Variable *con_output = scope->Var("elementwise_add_0.tmp_0");
+                Tensor *output_tensor = con_output->GetMutable<Tensor>();
+                output_tensor->mutable_data<float>({1, 3, 224, 224});
+
+                predict_add(t1, t2, 0);
+
+                // No-op deleter: expose the tensor without taking ownership.
+                return std::shared_ptr<Tensor>(output_tensor,
+                                               [](Tensor *) {});
+            }
+
+          private:
+            const framework::Program<Dtype> program_;
+            std::shared_ptr<ProgramDesc> to_predict_program_;
+            std::map<framework::BlockDesc,
+                     std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+                ops_of_block_;
+            bool use_optimize_ = false;
+
+            /// Runs, in insertion order, every op collected for block
+            /// `block_id`. The tensor arguments are not read here; the ops
+            /// take their data from the scope.
+            void predict_add(const Tensor &t1, const Tensor &t2, int block_id) {
+                std::shared_ptr<BlockDesc> to_predict_block =
+                    to_predict_program_->Block(block_id);
+                for (const auto &op : ops_of_block_[*to_predict_block]) {
+                    std::cout << "op -> run()" << std::endl;
+                    op->Run();
+                }
+            }
+        };
+
+        template class TestElementwiseAddOp<CPU>;
+    } // namespace framework
+
+    namespace test {
+        /// Smoke test for elementwise_add: loads the ResNet inference model,
+        /// feeds a random (1,3,224,224) X and a random (224,) Y through
+        /// TestElementwiseAddOp and prints the output size plus one sample
+        /// sum (X[226] + Y[2] next to output[226]).
+        ///
+        /// Declared inline because it is defined in a header; without it,
+        /// including this header from two translation units would produce
+        /// duplicate definitions.
+        inline void testElementwiseAdd() {
+            paddle_mobile::Loader<paddle_mobile::CPU> loader;
+            auto program = loader.Load(
+                std::string("../../test/models/"
+                            "image_classification_resnet.inference.model"));
+
+            /// input x (1,3,224,224)
+            paddle_mobile::framework::Tensor inputx;
+            SetupTensor<float>(&inputx, {1, 3, 224, 224}, static_cast<float>(0),
+                               static_cast<float>(1));
+            float *inputx_ptr = inputx.data<float>();
+            /// input y (224,)
+            paddle_mobile::framework::Tensor inputy;
+            SetupTensor<float>(&inputy, {224}, static_cast<float>(0),
+                               static_cast<float>(1));
+            float *inputy_ptr = inputy.data<float>();
+
+            paddle_mobile::framework::TestElementwiseAddOp<paddle_mobile::CPU>
+                testElementwiseAddOp(program);
+
+            auto output_add = testElementwiseAddOp.predict_add(inputx, inputy);
+            float *output_add_ptr = output_add->data<float>();
+
+            /// output (1,3,224,224)
+            std::cout << "output memory size : " << output_add->memory_size()
+                      << std::endl;
+            std::cout << "output numel : " << output_add->numel() << std::endl;
+
+            std::cout << inputx_ptr[226] << " + " << inputy_ptr[2] << " = "
+                      << output_add_ptr[226] << std::endl;
+        }
+    } // namespace test
+} // namespace paddle_mobile
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -16,6 +16,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 ==============================================================================*/

+#include "elementwise_add_op_test.h"
 #include "framework/executor.h"
 #include "io.h"
 #include "test_helper.h"
@@ -36,45 +37,44 @@ SOFTWARE.
 //}

 int main() {
-  std::string data_set = "cifar10";
-  //
-  //    if (data_set == "cifar10") {
-  //        SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
-  //                           static_cast<float>(0), static_cast<float>(1));
-  //    } else if (data_set == "imagenet") {
-  //        SetupTensor<float>(&input, {FLAGS_batch_size, 3, 224, 224},
-  //                           static_cast<float>(0), static_cast<float>(1));
-  //    } else {
-  //        LOG(FATAL) << "Only cifar10 or imagenet is supported.";
-  //    }
+    std::string data_set = "cifar10";
+    //
+    //    if (data_set == "cifar10") {
+    //        SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
+    //                           static_cast<float>(0), static_cast<float>(1));
+    //    } else if (data_set == "imagenet") {
+    //        SetupTensor<float>(&input, {FLAGS_batch_size, 3, 224, 224},
+    //                           static_cast<float>(0), static_cast<float>(1));
+    //    } else {
+    //        LOG(FATAL) << "Only cifar10 or imagenet is supported.";
+    //    }

-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  //../../test/models/image_classification_resnet.inference.model
-  auto program = loader.Load(std::string(
-      "../../test/models/image_classification_resnet.inference.model"));
+    paddle_mobile::Loader<paddle_mobile::CPU> loader;
+    auto program = loader.Load(std::string(
+        "../../test/models/image_classification_resnet.inference.model"));

-  paddle_mobile::framework::Executor<paddle_mobile::CPU> executor(program);
+    paddle_mobile::framework::Executor<paddle_mobile::CPU> executor(program);

-  paddle_mobile::framework::Tensor input;
-  SetupTensor<float>(&input, {1, 3, 32, 32}, static_cast<float>(0),
-                     static_cast<float>(1));
-  float *input_ptr = input.data<float>();
-  for (int i = 0; i < input.numel(); ++i) {
-    //    std::cout << input_ptr[i] << std::endl;
-  }
+    paddle_mobile::framework::Tensor input;
+    SetupTensor<float>(&input, {1, 3, 32, 32}, static_cast<float>(0),
+                       static_cast<float>(1));
+    float *input_ptr = input.data<float>();
+    for (int i = 0; i < input.numel(); ++i) {
+        //    std::cout << input_ptr[i] << std::endl;
+    }

-  //  std::cout << "input: " << input.memory_size() << std::endl;
-  //  std::cout << "input: " << input.numel() << std::endl;
+    //  std::cout << "input: " << input.memory_size() << std::endl;
+    //  std::cout << "input: " << input.numel() << std::endl;

-  auto output = executor.predict(input);
+    auto output = executor.predict(input);

-  //  std::cout << "output: " << output->memory_size() << std::endl;
-  //  std::cout << "output: " << output->numel() << std::endl;
+    //  std::cout << "output: " << output->memory_size() << std::endl;
+    //  std::cout << "output: " << output->numel() << std::endl;

-  //  float* output_ptr = output->data<float>();
-  //  for (int j = 0; j < output->numel(); ++j) {
-  //    std::cout << " value of output: " << output_ptr[j] << std::endl;
-  //  }
-
-  return 0;
+    //  float* output_ptr = output->data<float>();
+    //  for (int j = 0; j < output->numel(); ++j) {
+    //    std::cout << " value of output: " << output_ptr[j] << std::endl;
+    //  }
+    paddle_mobile::test::testElementwiseAdd();
+    return 0;
 }
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -15,20 +15,21 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 ==============================================================================*/
-
-#include <random>
+#pragma once
 #include "framework/ddim.h"
 #include "framework/tensor.h"
+#include <random>

 template <typename T>
-void SetupTensor(paddle_mobile::framework::Tensor* input,
+void SetupTensor(paddle_mobile::framework::Tensor *input, // test helper: fills *input with pseudo-random values cast to T
                  paddle_mobile::framework::DDim dims, T lower, T upper) {
-  static unsigned int seed = 100;
-  std::mt19937 rng(seed++);
-  std::uniform_real_distribution<double> uniform_dist(0, 1);
+    static unsigned int seed = 100; // function-local static: keeps its value between calls, starting at 100
+    std::mt19937 rng(seed++); // every call builds a new engine from the current seed, then bumps the seed
+    std::uniform_real_distribution<double> uniform_dist(0, 1);

-  T* input_ptr = input->mutable_data<T>(dims);
-  for (int i = 0; i < input->numel(); ++i) {
-    input_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-  }
+    T *input_ptr = input->mutable_data<T>(dims); // NOTE(review): presumably sizes the tensor to dims and returns its buffer - confirm in tensor.h
+    for (int i = 0; i < input->numel(); ++i) {
+        input_ptr[i] =
+            static_cast<T>(uniform_dist(rng) * (upper - lower) + lower); // scale the draw from the 0..1 range to lower..upper, then cast to T
+    }
 }
--- a/test/test_include.h
+++ b/test/test_include.h
+#include "framework/block_desc.h"
+#include "framework/framework.pb.h"
+#include "framework/lod_tensor.h"
+#include "framework/operator.h"
+#include "framework/program.h"
+#include "framework/program_desc.h"
+#include "framework/scope.h"
+#include "framework/tensor.h"
+#include "framework/variable.h"
+#include "framework/variable.h"
+#include "io.h"
+#include "test_helper.h"
+#include <map>
+#include <string>
+#include <vector>
\ No newline at end of file
--- a/android-cmake/android.toolchain.cmake
+++ b/android-cmake/android.toolchain.cmake
--- a/ios-cmake/ios.toolchain.cmake
+++ b/ios-cmake/ios.toolchain.cmake
--- a/.clang_format.hook
+++ b/.clang_format.hook
--- a/tools/pre-commit.hooks/.copyright.hook
+++ b/tools/pre-commit.hooks/.copyright.hook
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io, re
+import sys, os
+import subprocess
+import platform
+
+COPYRIGHT = '''
+Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+'''
+
+LANG_COMMENT_MARK = None
+
+# Separator used to split the template and to build the generated header.
+# It is '\n' on every platform: the COPYRIGHT literal only ever contains
+# '\n', and text-mode io.open() translates '\n' to the platform line ending
+# when writing, so "\r\n" would fail to split the template and would double
+# the carriage return on Windows.
+NEW_LINE_MARK = '\n'
+
+# Imported here because only this block needs it.
+import datetime
+
+# First text line of the template (index 0 is the empty string in front of
+# the template's leading newline) with its four-digit year replaced by the
+# current local year. Computed on all platforms so that
+# generate_copyright() never has to concatenate None.
+COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
+p = re.search(r'(\d{4})', COPYRIGHT_HEADER).group(0)
+date = "{}".format(datetime.date.today().year)
+COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
+
+
+def generate_copyright(template, lang='C'):
+    """Render the copyright template as a block of line comments.
+
+    The comment marker is '#' when ``lang`` is 'Python' and '//' otherwise.
+    The first output line is built from COPYRIGHT_HEADER; template lines at
+    index 0, index 1 and the last index are skipped. Empty template lines
+    are emitted as the bare marker, without a trailing space. The result
+    ends with one extra newline.
+    """
+    mark = '#' if lang == 'Python' else "//"
+
+    template_lines = template.split(NEW_LINE_MARK)
+    last_index = len(template_lines) - 1
+    pieces = [mark + " " + COPYRIGHT_HEADER + NEW_LINE_MARK]
+    for index, text in enumerate(template_lines):
+        if index in (0, 1, last_index):
+            continue
+        spacer = " " if text else ""
+        pieces.append(mark + spacer + text + NEW_LINE_MARK)
+
+    return "".join(pieces) + "\n"
+
+
+def lang_type(filename):
+    """Map a file name to the comment style used for its header.
+
+    Returns "Python" for ``.py`` files and "C" for every other supported
+    extension. For any other name a message is printed and the process
+    exits with status 0.
+    """
+    if filename.endswith(".py"):
+        return "Python"
+    if filename.endswith((".h", ".c", ".hpp", ".cc", ".cpp", ".cu", ".cuh",
+                          ".go", ".proto")):
+        return "C"
+    # print() is a function in this module, so the name has to be
+    # interpolated explicitly; passing it as a second argument printed the
+    # raw "%s" placeholder.
+    print("Unsupported filetype %s" % filename)
+    sys.exit(0)
+
+
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+
+
+def main(argv=None):
+    """Insert the copyright header into every listed file that lacks one.
+
+    A file is left untouched when its first line contains "COPYRIGHT (C)"
+    (compared in upper case), when its first line starts with "#!", or when
+    either of its first two lines is a Python coding declaration.
+
+    Returns 1 if at least one file was rewritten, otherwise 0.
+    """
+    parser = argparse.ArgumentParser(
+        description='Checker for copyright declaration.')
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    retv = 0
+    for filename in args.filenames:
+        # Read the file once and close it before it is reopened for writing.
+        with io.open(filename, encoding="utf-8") as fd:
+            first_line = fd.readline()
+            second_line = fd.readline()
+            remainder = fd.read()
+        if "COPYRIGHT (C)" in first_line.upper(): continue
+        if (first_line.startswith("#!") or
+                PYTHON_ENCODE.match(second_line) is not None or
+                PYTHON_ENCODE.match(first_line) is not None):
+            continue
+        original_contents = first_line + second_line + remainder
+        new_contents = generate_copyright(
+            COPYRIGHT, lang_type(filename)) + original_contents
+        print('Auto Insert Copyright Header {}'.format(filename))
+        retv = 1
+        # Write with the same encoding the file was read with, instead of
+        # the locale default.
+        with io.open(filename, 'w', encoding="utf-8") as output_file:
+            output_file.write(new_contents)
+
+    return retv
+
+
+# Script entry point: the process exit status is main()'s return value.
+# sys.exit is used because the bare exit() helper is only installed by the
+# site module.
+if __name__ == '__main__':
+    sys.exit(main())
--- a/tools/pre-commit.hooks/clang-format.bash
+++ b/tools/pre-commit.hooks/clang-format.bash
+#!/bin/bash
+# Checks that the installed clang-format reports a "version 3." release and
+# then runs it with the arguments given to this script.
+set -e
+
+readonly VERSION="version 3."
+
+version=$(clang-format -version)
+
+if ! [[ $version == *"$VERSION"* ]]; then
+    echo "clang-format version check failed."
+    echo "a version contains '$VERSION' is needed, but get '$version'"
+    echo "you can install the right version, and make an soft-link to '\$PATH' env"
+    # Exit statuses are limited to 0-255, so use a plain failure code.
+    exit 1
+fi
+
+# "$@" keeps every argument intact, including paths that contain spaces.
+clang-format "$@"
--- a/tools/pre-commit.hooks/copyright.py
+++ b/tools/pre-commit.hooks/copyright.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io, re
+import sys, os
+import subprocess
+import platform
+
+COPYRIGHT = '''
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+==============================================================================*/
+'''
+
+LANG_COMMENT_MARK = None
+
+# Separator used to split the template and to build the generated header.
+# It is '\n' on every platform: the COPYRIGHT literal only ever contains
+# '\n', and text-mode io.open() translates '\n' to the platform line ending
+# when writing, so "\r\n" would fail to split the template and would double
+# the carriage return on Windows.
+NEW_LINE_MARK = '\n'
+
+# Imported here because only this block needs it.
+import datetime
+
+# First text line of the template (index 0 is the empty string in front of
+# the template's leading newline) with its four-digit year replaced by the
+# current local year. Computed on all platforms so that
+# generate_copyright() never has to concatenate None.
+COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
+p = re.search(r'(\d{4})', COPYRIGHT_HEADER).group(0)
+date = "{}".format(datetime.date.today().year)
+COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
+
+
+def generate_copyright(template, lang='C'):
+    """Render the copyright template as a block of line comments.
+
+    The comment marker is '#' when ``lang`` is 'Python' and '//' otherwise.
+    The first output line is built from COPYRIGHT_HEADER; template lines at
+    index 0, index 1 and the last index are skipped. Empty template lines
+    are emitted as the bare marker, without a trailing space. The result
+    ends with one extra newline.
+    """
+    mark = '#' if lang == 'Python' else "//"
+
+    template_lines = template.split(NEW_LINE_MARK)
+    last_index = len(template_lines) - 1
+    pieces = [mark + " " + COPYRIGHT_HEADER + NEW_LINE_MARK]
+    for index, text in enumerate(template_lines):
+        if index in (0, 1, last_index):
+            continue
+        spacer = " " if text else ""
+        pieces.append(mark + spacer + text + NEW_LINE_MARK)
+
+    return "".join(pieces) + "\n"
+
+
+def lang_type(filename):
+    """Map a file name to the comment style used for its header.
+
+    Returns "Python" for ``.py`` files and "C" for every other supported
+    extension. For any other name a message is printed and the process
+    exits with status 0.
+    """
+    if filename.endswith(".py"):
+        return "Python"
+    if filename.endswith((".h", ".c", ".hpp", ".cc", ".cpp", ".cu", ".cuh",
+                          ".go", ".proto")):
+        return "C"
+    # print() is a function in this module, so the name has to be
+    # interpolated explicitly; passing it as a second argument printed the
+    # raw "%s" placeholder.
+    print("Unsupported filetype %s" % filename)
+    sys.exit(0)
+
+
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+
+
+def main(argv=None):
+    """Insert the copyright header into every listed file that lacks one.
+
+    A file is left untouched when its first line contains "COPYRIGHT "
+    (compared in upper case), when its first line starts with "#!", or when
+    either of its first two lines is a Python coding declaration.
+
+    Returns 1 if at least one file was rewritten, otherwise 0.
+    """
+    parser = argparse.ArgumentParser(
+        description='Checker for copyright declaration.')
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    retv = 0
+    for filename in args.filenames:
+        # Read the file once and close it before it is reopened for writing.
+        with io.open(filename, encoding="utf-8") as fd:
+            first_line = fd.readline()
+            second_line = fd.readline()
+            remainder = fd.read()
+        if "COPYRIGHT " in first_line.upper(): continue
+        if (first_line.startswith("#!") or
+                PYTHON_ENCODE.match(second_line) is not None or
+                PYTHON_ENCODE.match(first_line) is not None):
+            continue
+        original_contents = first_line + second_line + remainder
+        new_contents = generate_copyright(
+            COPYRIGHT, lang_type(filename)) + original_contents
+        print('Auto Insert Copyright Header {}'.format(filename))
+        retv = 1
+        # Write with the same encoding the file was read with, instead of
+        # the locale default.
+        with io.open(filename, 'w', encoding="utf-8") as output_file:
+            output_file.write(new_contents)
+
+    return retv
+
+
+# Script entry point: the process exit status is main()'s return value.
+# sys.exit is used because the bare exit() helper is only installed by the
+# site module.
+if __name__ == '__main__':
+    sys.exit(main())
--- a/tools/pre-commit.hooks/cpplint.bash
+++ b/tools/pre-commit.hooks/cpplint.bash
+#!/bin/bash
+# Runs cpplint on each staged file that is not deleted and exits with the
+# sum of the cpplint exit statuses (capped at 255).
+
+TOTAL_ERRORS=0
+
+#iclang-tidy *.[ch]pp -checks=*
+# --diff-filter=d leaves out deleted files and --name-only prints one path
+# per line (the new path for renames). Reading line by line and quoting
+# "$file" keeps paths with spaces in one piece; process substitution keeps
+# the loop in this shell so TOTAL_ERRORS survives it.
+while IFS= read -r file; do
+    cpplint "$file"
+    TOTAL_ERRORS=$((TOTAL_ERRORS + $?))
+done < <(git diff --cached --name-only --diff-filter=d | grep -v ".pb." | grep -v "third-party/")
+
+# An exit status only keeps its low 8 bits, so a total of 256 would be
+# reported as success; cap it instead.
+if [ "$TOTAL_ERRORS" -gt 255 ]; then
+    exit 255
+fi
+exit $TOTAL_ERRORS
+