Add clang-format and clang-tidy pre-commit hooks

47e5978f · 朔-望 · ab5a35c2 · 47e5978f · 47e5978f · 47e5978f
76 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,11 +20,20 @@ repos:
    -   id: trailing-whitespace
        files: (src).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$

+-   repo: local
+    hooks:
+    -   id: clang-format
+        name: clang-format
+        description: Format files with ClangFormat.
+        entry: bash ./tools/pre-commit.hooks/.clang-format.hook -i
+        language: system
+        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+
 -   repo: local
    hooks:
    -   id: clang-tidy
        name: clang-tidy
-        description: Format files with tidy.
+        description: Check C++ code style using clang-tidy.
        entry: bash ./tools/pre-commit.hooks/.clang-tidy.hook -i
        language: system
        files: (src).*\.(c|cc|cxx|cpp|h|hpp|hxx)$

--- a/src/common/log.h
+++ b/src/common/log.h
@@ -27,146 +27,145 @@ SOFTWARE.

 namespace paddle_mobile {

-    enum LogLevel {
-        kNO_LOG,
-        kLOG_ERROR,
-        kLOG_WARNING,
-        kLOG_INFO,
-        kLOG_DEBUG,
-        kLOG_DEBUG1,
-        kLOG_DEBUG2,
-        kLOG_DEBUG3,
-        kLOG_DEBUG4
-    };
-
-    // log level
-    static LogLevel log_level = kLOG_DEBUG4;
-
-    static std::vector<std::string> logs{"NO",      "ERROR ",  "WARNING",
-                                         "INFO   ", "DEBUG  ", "DEBUG1 ",
-                                         "DEBUG2 ", "DEBUG3 ", "DEBUG4 "};
-    struct ToLog;
-    struct Print;
-
-    struct Print {
-        friend struct ToLog;
-
-        template <typename T> Print &operator<<(T const &value) {
-            buffer_ << value;
-            return *this;
+enum LogLevel {
+    kNO_LOG,
+    kLOG_ERROR,
+    kLOG_WARNING,
+    kLOG_INFO,
+    kLOG_DEBUG,
+    kLOG_DEBUG1,
+    kLOG_DEBUG2,
+    kLOG_DEBUG3,
+    kLOG_DEBUG4
+};
+
+// log level
+static LogLevel log_level = kLOG_DEBUG4;
+
+static std::vector<std::string> logs{"NO",      "ERROR ",  "WARNING",
+                                     "INFO   ", "DEBUG  ", "DEBUG1 ",
+                                     "DEBUG2 ", "DEBUG3 ", "DEBUG4 "};
+struct ToLog;
+struct Print;
+
+struct Print {
+    friend struct ToLog;
+
+    template <typename T> Print &operator<<(T const &value) {
+        buffer_ << value;
+        return *this;
+    }
+
+  private:
+    void print(LogLevel level) {
+        buffer_ << std::endl;
+        if (level == kLOG_ERROR) {
+            std::cerr << buffer_.str();
+        } else {
+            std::cout << buffer_.str();
        }
-
-      private:
-        void print(LogLevel level) {
-            buffer_ << std::endl;
-            if (level == kLOG_ERROR) {
-                std::cerr << buffer_.str();
-            } else {
-                std::cout << buffer_.str();
-            }
-        }
-        std::ostringstream buffer_;
-    };
-
-    struct ToLog {
-        ToLog(LogLevel level = kLOG_DEBUG, const std::string &info = "")
-            : level_(level) {
-            unsigned blanks =
-                (unsigned)(level > kLOG_DEBUG ? (level - kLOG_DEBUG) * 4 : 1);
-            printer_ << logs[level] << " " << info << ":"
-                     << std::string(blanks, ' ');
-        }
-
-        template <typename T> ToLog &operator<<(T const &value) {
-            printer_ << value;
-            return *this;
-        }
-
-        ~ToLog() { printer_.print(level_); }
-
-      private:
-        LogLevel level_;
-        Print printer_;
-    };
+    }
+    std::ostringstream buffer_;
+};
+
+struct ToLog {
+    ToLog(LogLevel level = kLOG_DEBUG, const std::string &info = "")
+        : level_(level) {
+        unsigned blanks =
+            (unsigned)(level > kLOG_DEBUG ? (level - kLOG_DEBUG) * 4 : 1);
+        printer_ << logs[level] << " " << info << ":"
+                 << std::string(blanks, ' ');
+    }
+
+    template <typename T> ToLog &operator<<(T const &value) {
+        printer_ << value;
+        return *this;
+    }
+
+    ~ToLog() { printer_.print(level_); }
+
+  private:
+    LogLevel level_;
+    Print printer_;
+};

 #define LOG(level)                                                             \
    if (level > paddle_mobile::log_level) {                                    \
    } else                                                                     \
-    paddle_mobile::ToLog(                                                      \
-        level,                                                                 \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
-            .str())
+        paddle_mobile::ToLog(                                                  \
+            level, (std::stringstream()                                        \
+                    << "[file: "                                               \
+                    << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1)  \
+                                               : __FILE__)                     \
+                    << "] [line: " << __LINE__ << "] ")                        \
+                       .str())

 #define DLOG                                                                   \
    if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {                \
    } else                                                                     \
-    paddle_mobile::ToLog(                                                      \
-        paddle_mobile::kLOG_DEBUG,                                             \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
-            .str())
-}
+        paddle_mobile::ToLog(                                                  \
+            paddle_mobile::kLOG_DEBUG,                                         \
+            (std::stringstream()                                               \
+             << "[file: "                                                      \
+             << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1)         \
+                                        : __FILE__)                            \
+             << "] [line: " << __LINE__ << "] ")                               \
+                .str())
+} // namespace paddle_mobile

 #define LOGF(level, format, ...)                                               \
    if (level > paddle_mobile::log_level) {                                    \
    } else                                                                     \
-    printf(format, ##__VA_ARGS__)
+        printf(format, ##__VA_ARGS__)

 #define DLOGF(format, ...)                                                     \
    if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {                \
    } else                                                                     \
-    printf(format, ##__VA_ARGS__)
+        printf(format, ##__VA_ARGS__)

 #else

 namespace paddle_mobile {

-    enum LogLevel {
-        kNO_LOG,
-        kLOG_ERROR,
-        kLOG_WARNING,
-        kLOG_INFO,
-        kLOG_DEBUG,
-        kLOG_DEBUG1,
-        kLOG_DEBUG2,
-        kLOG_DEBUG3,
-        kLOG_DEBUG4
-    };
-
-    struct ToLog;
-    struct Print {
-        friend struct ToLog;
-        template <typename T> Print &operator<<(T const &value) {}
-
-      private:
-    };
-
-    struct ToLog {
-        ToLog(LogLevel level) {}
-
-        template <typename T> ToLog &operator<<(T const &value) {
-            return *this;
-        }
-    };
+enum LogLevel {
+    kNO_LOG,
+    kLOG_ERROR,
+    kLOG_WARNING,
+    kLOG_INFO,
+    kLOG_DEBUG,
+    kLOG_DEBUG1,
+    kLOG_DEBUG2,
+    kLOG_DEBUG3,
+    kLOG_DEBUG4
+};
+
+struct ToLog;
+struct Print {
+    friend struct ToLog;
+    template <typename T> Print &operator<<(T const &value) {}
+
+  private:
+};
+
+struct ToLog {
+    ToLog(LogLevel level) {}
+
+    template <typename T> ToLog &operator<<(T const &value) { return *this; }
+};

 #define LOG(level)                                                             \
    if (true) {                                                                \
    } else                                                                     \
-    paddle_mobile::ToLog(level)
+        paddle_mobile::ToLog(level)

 #define DLOG                                                                   \
    if (true) {                                                                \
    } else                                                                     \
-    paddle_mobile::ToLog(paddle_mobile::kLOG_DEBUG)
+        paddle_mobile::ToLog(paddle_mobile::kLOG_DEBUG)

 #define LOGF(level, format, ...)

 #define DLOGF(format, ...)
-}
+} // namespace paddle_mobile

 #endif
--- a/src/common/type_define.h
+++ b/src/common/type_define.h
@@ -24,30 +24,29 @@ SOFTWARE.
 namespace paddle_mobile {

 namespace framework {
-template<typename Dtype> class OperatorBase;
+template <typename Dtype> class OperatorBase;
 class OpDesc;
 class BlockDesc;
 class InferShapeContext;
-}
+} // namespace framework

 using VariableNameMap = std::map<std::string, std::vector<std::string>>;

-template<typename Dtype>
+template <typename Dtype>
 using OpCreator = std::function<framework::OperatorBase<Dtype> *(
    const std::string & /*type*/, const VariableNameMap & /*inputs*/,
    const VariableNameMap & /*outputs*/,
    const framework::AttributeMap & /*attrs*/)>;

 using GradOpMakerFN =
-std::function<std::vector<std::unique_ptr<framework::OpDesc>>(
-    const framework::OpDesc &,
-    const std::unordered_set<std::string> & /*no_grad_set*/,
-    std::unordered_map<std::string, std::string> * /*grad_to_var*/,
-    const std::vector<framework::BlockDesc *> &grad_block)>;
+    std::function<std::vector<std::unique_ptr<framework::OpDesc>>(
+        const framework::OpDesc &,
+        const std::unordered_set<std::string> & /*no_grad_set*/,
+        std::unordered_map<std::string, std::string> * /*grad_to_var*/,
+        const std::vector<framework::BlockDesc *> &grad_block)>;

-using InferVarTypeFN =
-std::function<void(const framework::OpDesc & /*op_desc*/,
-                   framework::BlockDesc * /*block*/)>;
+using InferVarTypeFN = std::function<void(const framework::OpDesc & /*op_desc*/,
+                                          framework::BlockDesc * /*block*/)>;

 using InferShapeFN = std::function<void(framework::InferShapeContext *)>;
-};
+}; // namespace paddle_mobile
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -24,7 +24,7 @@ enum class Precision : int { FP32 = 0 };
 //! device type
 enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };

-template<DeviceTypeEnum T> struct DeviceType {};
+template <DeviceTypeEnum T> struct DeviceType {};

 typedef DeviceType<kCPU> CPU;
 typedef DeviceType<kFPGA> FPGA;
@@ -32,32 +32,32 @@ typedef DeviceType<kGPU_MALI> GPU_MALI;

 //! data type
 enum DataType {
-  PM_INVALID = -1,
-  PM_HALF = 0,
-  PM_FLOAT = 1,
-  PM_DOUBLE = 2,
-  PM_INT8 = 3,
-  PM_INT16 = 4,
-  PM_INT32 = 5,
-  PM_INT64 = 6,
-  PM_UINT8 = 7,
-  PM_UINT16 = 8,
-  PM_UINT32 = 9,
-  PM_STRING = 10,
-  PM_BOOL = 11,
-  PM_SHAPE = 12,
-  PM_TENSOR = 13
+    PM_INVALID = -1,
+    PM_HALF = 0,
+    PM_FLOAT = 1,
+    PM_DOUBLE = 2,
+    PM_INT8 = 3,
+    PM_INT16 = 4,
+    PM_INT32 = 5,
+    PM_INT64 = 6,
+    PM_UINT8 = 7,
+    PM_UINT16 = 8,
+    PM_UINT32 = 9,
+    PM_STRING = 10,
+    PM_BOOL = 11,
+    PM_SHAPE = 12,
+    PM_TENSOR = 13
 };
 //!
 enum PMStatus {
-  PMSuccess = 0xFF,        /*!< No errors */
-  PMNotInitialized = 0x01, /*!< Data not initialized. */
-  PMInvalidValue = 0x02,   /*!< Incorrect variable value. */
-  PMMemAllocFailed = 0x03, /*!< Memory allocation error. */
-  PMUnKownError = 0x04,    /*!< Unknown error. */
-  PMOutOfAuthority = 0x05, /*!< Try to modified data not your own*/
-  PMOutOfMem = 0x06,       /*!< OOM error*/
-  PMUnImplError = 0x07,    /*!< Unimplement error. */
-  PMWrongDevice = 0x08     /*!< un-correct device. */
+    PMSuccess = 0xFF,        /*!< No errors */
+    PMNotInitialized = 0x01, /*!< Data not initialized. */
+    PMInvalidValue = 0x02,   /*!< Incorrect variable value. */
+    PMMemAllocFailed = 0x03, /*!< Memory allocation error. */
+    PMUnKownError = 0x04,    /*!< Unknown error. */
+    PMOutOfAuthority = 0x05, /*!< Try to modified data not your own*/
+    PMOutOfMem = 0x06,       /*!< OOM error*/
+    PMUnImplError = 0x07,    /*!< Unimplement error. */
+    PMWrongDevice = 0x08     /*!< un-correct device. */
 };
-}
+} // namespace paddle_mobile
--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -21,79 +21,79 @@ SOFTWARE.
 #pragma once

 namespace paddle_mobile {
-template<int ID, typename Type> struct IDToType { typedef Type type_t; };
+template <int ID, typename Type> struct IDToType { typedef Type type_t; };

-template<typename F, typename... Ts> struct VariantHelper {
-  static const size_t size = sizeof(F) > VariantHelper<Ts...>::size
-                             ? sizeof(F)
-                             : VariantHelper<Ts...>::size;
+template <typename F, typename... Ts> struct VariantHelper {
+    static const size_t size = sizeof(F) > VariantHelper<Ts...>::size
+                                   ? sizeof(F)
+                                   : VariantHelper<Ts...>::size;

-  inline static void Destroy(size_t id, void *data) {
-    if (id == typeid(F).hash_code()) {
-      reinterpret_cast<F *>(data)->~F();
-    } else {
-      VariantHelper<Ts...>::Destroy(id, data);
+    inline static void Destroy(size_t id, void *data) {
+        if (id == typeid(F).hash_code()) {
+            reinterpret_cast<F *>(data)->~F();
+        } else {
+            VariantHelper<Ts...>::Destroy(id, data);
+        }
    }
-  }
 };

-template<typename F> struct VariantHelper<F> {
-  static const size_t size = sizeof(F);
-  inline static void Destroy(size_t id, void *data) {
-    if (id == typeid(F).hash_code()) {
-      //              reinterpret_cast<F*>(data)->~F();
-    } else {
-      //              std::cout << "未匹配到 " << std::endl;
+template <typename F> struct VariantHelper<F> {
+    static const size_t size = sizeof(F);
+    inline static void Destroy(size_t id, void *data) {
+        if (id == typeid(F).hash_code()) {
+            //              reinterpret_cast<F*>(data)->~F();
+        } else {
+            //              std::cout << "未匹配到 " << std::endl;
+        }
    }
-  }
 };

-template<size_t size> class RawData {
-public:
-  char data[size];
-  RawData() {}
-  RawData(const RawData &raw_data) { strcpy(data, raw_data.data); }
-  //      void operator=(const RawData &raw_data){
-  //        strcpy(data, raw_data.data);
-  //      }
+template <size_t size> class RawData {
+  public:
+    char data[size];
+    RawData() {}
+    RawData(const RawData &raw_data) { strcpy(data, raw_data.data); }
+    //      void operator=(const RawData &raw_data){
+    //        strcpy(data, raw_data.data);
+    //      }
 };

-template<typename... Ts> struct Variant {
-  Variant(const Variant &variant) {
-    //        std::cout << " 赋值构造函数 " << std::endl;
-    type_id = variant.type_id;
-    data = variant.data;
-  }
+template <typename... Ts> struct Variant {
+    Variant(const Variant &variant) {
+        //        std::cout << " 赋值构造函数 " << std::endl;
+        type_id = variant.type_id;
+        data = variant.data;
+    }

-  Variant() : type_id(invalid_type()) {}
-  ~Variant() {
-    //        helper::Destroy(type_id, &data);
-  }
+    Variant() : type_id(invalid_type()) {}
+    ~Variant() {
+        //        helper::Destroy(type_id, &data);
+    }

-  template<typename T, typename... Args> void Set(Args &&... args) {
-    helper::Destroy(type_id, &data);
-    new(&data) T(std::forward<Args>(args)...);
-    type_id = typeid(T).hash_code();
-  }
+    template <typename T, typename... Args> void Set(Args &&... args) {
+        helper::Destroy(type_id, &data);
+        new (&data) T(std::forward<Args>(args)...);
+        type_id = typeid(T).hash_code();
+    }

-  template<typename T> T &Get() const {
-    if (type_id == typeid(T).hash_code()) {
-      return *const_cast<T *>(reinterpret_cast<const T *>(&data));
-    } else {
-      //      std::cout << " bad cast in variant " << std::endl;
-      throw std::bad_cast();
+    template <typename T> T &Get() const {
+        if (type_id == typeid(T).hash_code()) {
+            return *const_cast<T *>(reinterpret_cast<const T *>(&data));
+        } else {
+            //      std::cout << " bad cast in variant " << std::endl;
+            throw std::bad_cast();
+        }
    }
-  }

-  size_t TypeId() const { return type_id; }
+    size_t TypeId() const { return type_id; }

-private:
-  static inline size_t invalid_type() { return typeid(void).hash_code(); }
-  typedef VariantHelper<Ts...> helper;
-  size_t type_id;
-  RawData<helper::size> data;
+  private:
+    static inline size_t invalid_type() { return typeid(void).hash_code(); }
+    typedef VariantHelper<Ts...> helper;
+    size_t type_id;
+    RawData<helper::size> data;
 };

-template<typename T> struct Vistor { typedef T type_t; };
+template <typename T> struct Vistor { typedef T type_t; };

 } // namespace paddle_mobile
--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -27,104 +27,102 @@ namespace framework {
 class BlockDesc;

 class Attribute {
-public:
-  static Attribute
-  GetAttrValue(const proto::OpDesc::Attr &attr_desc) {
-    //    std::cout << "begin get attr value" << std::endl;
-    Attribute attr;
-    switch (attr_desc.type()) {
-    case proto::AttrType::BOOLEAN: {
-      attr.Set<bool>(attr_desc.b());
-      break;
+  public:
+    static Attribute GetAttrValue(const proto::OpDesc::Attr &attr_desc) {
+        //    std::cout << "begin get attr value" << std::endl;
+        Attribute attr;
+        switch (attr_desc.type()) {
+        case proto::AttrType::BOOLEAN: {
+            attr.Set<bool>(attr_desc.b());
+            break;
+        }
+        case proto::AttrType::INT: {
+            attr.Set<int>(attr_desc.i());
+            break;
+        }
+        case proto::AttrType::FLOAT: {
+            attr.Set<float>(attr_desc.f());
+            break;
+        }
+        case proto::AttrType::STRING: {
+            attr.Set<std::string>(attr_desc.s());
+            break;
+        }
+        case proto::AttrType::BOOLEANS: {
+            std::vector<bool> val(attr_desc.bools_size());
+            for (int i = 0; i < attr_desc.bools_size(); ++i) {
+                val[i] = attr_desc.bools(i);
+            }
+            attr.Set<std::vector<bool>>(val);
+            break;
+        }
+        case proto::AttrType::INTS: {
+            std::vector<int> val(attr_desc.ints_size());
+            for (int i = 0; i < attr_desc.ints_size(); ++i) {
+                val[i] = attr_desc.ints(i);
+            }
+            attr.Set<std::vector<int>>(val);
+            break;
+        }
+        case proto::AttrType::FLOATS: {
+            std::vector<float> val(attr_desc.floats_size());
+            for (int i = 0; i < attr_desc.floats_size(); ++i) {
+                val[i] = attr_desc.floats(i);
+            }
+            attr.Set<std::vector<float>>(val);
+            break;
+        }
+        case proto::AttrType::STRINGS: {
+            std::vector<std::string> val(attr_desc.strings_size());
+            for (int i = 0; i < attr_desc.strings_size(); ++i) {
+                val[i] = attr_desc.strings(i);
+            }
+            attr.Set<std::vector<std::string>>(val);
+            break;
+        }
+        case proto::AttrType::LONG: {
+            attr.Set<int64_t>(attr_desc.l());
+            break;
+        }
+        default:
+            //        std::cout << " not support " << std::endl;
+            break;
+        }
+        //    std::cout << "end get attr value" << std::endl;
+        return attr;
    }
-    case proto::AttrType::INT: {
-      attr.Set<int>(attr_desc.i());
-      break;
-    }
-    case proto::AttrType::FLOAT: {
-      attr.Set<float>(attr_desc.f());
-      break;
-    }
-    case proto::AttrType::STRING: {
-      attr.Set<std::string>(attr_desc.s());
-      break;
-    }
-    case proto::AttrType::BOOLEANS: {
-      std::vector<bool> val(attr_desc.bools_size());
-      for (int i = 0; i < attr_desc.bools_size(); ++i) {
-        val[i] = attr_desc.bools(i);
-      }
-      attr.Set<std::vector<bool>>(val);
-      break;
-    }
-    case proto::AttrType::INTS: {
-      std::vector<int> val(attr_desc.ints_size());
-      for (int i = 0; i < attr_desc.ints_size(); ++i) {
-        val[i] = attr_desc.ints(i);
-      }
-      attr.Set<std::vector<int>>(val);
-      break;
-    }
-    case proto::AttrType::FLOATS: {
-      std::vector<float> val(attr_desc.floats_size());
-      for (int i = 0; i < attr_desc.floats_size(); ++i) {
-        val[i] = attr_desc.floats(i);
-      }
-      attr.Set<std::vector<float>>(val);
-      break;
-    }
-    case proto::AttrType::STRINGS: {
-      std::vector<std::string> val(attr_desc.strings_size());
-      for (int i = 0; i < attr_desc.strings_size(); ++i) {
-        val[i] = attr_desc.strings(i);
-      }
-      attr.Set<std::vector<std::string>>(val);
-      break;
-    }
-    case proto::AttrType::LONG: {
-      attr.Set<int64_t>(attr_desc.l());
-      break;
-    }
-    default:
-      //        std::cout << " not support " << std::endl;
-      break;
-    }
-    //    std::cout << "end get attr value" << std::endl;
-    return attr;
-  }

-  Attribute() {}
-  template<typename T, typename... Args>
-  Attribute &Set(Args &&... args) {
-    variant_.Set<T>(args...);
-    return *this;
-  }
+    Attribute() {}
+    template <typename T, typename... Args> Attribute &Set(Args &&... args) {
+        variant_.Set<T>(args...);
+        return *this;
+    }

-  template<typename T> T &Get() const { return variant_.Get<T>(); }
+    template <typename T> T &Get() const { return variant_.Get<T>(); }

-private:
-  Variant<int, float, std::string, std::vector<int>,
-          std::vector<float>, std::vector<std::string>, bool,
-          std::vector<bool>, BlockDesc *, int64_t>
-      variant_;
+  private:
+    Variant<int, float, std::string, std::vector<int>, std::vector<float>,
+            std::vector<std::string>, bool, std::vector<bool>, BlockDesc *,
+            int64_t>
+        variant_;
 };

 using AttributeMap = std::unordered_map<std::string, Attribute>;

 class AttrReader {
-public:
-  explicit AttrReader(const AttributeMap &attrs) : attrs_(attrs) {}
+  public:
+    explicit AttrReader(const AttributeMap &attrs) : attrs_(attrs) {}

-  template<typename T> inline T Get(const std::string &name) const {
-    //          PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should
-    //          be in
-    //          AttributeMap",
-    //                         name);
-    return ((Attribute) attrs_.at(name)).Get<T>();
-  }
+    template <typename T> inline T Get(const std::string &name) const {
+        //          PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should
+        //          be in
+        //          AttributeMap",
+        //                         name);
+        return ((Attribute)attrs_.at(name)).Get<T>();
+    }

-private:
-  const AttributeMap &attrs_;
+  private:
+    const AttributeMap &attrs_;
 };

 } // namespace framework

--- a/src/framework/block_desc.cpp
+++ b/src/framework/block_desc.cpp
@@ -22,28 +22,28 @@ namespace paddle_mobile {
 namespace framework {

 std::vector<std::shared_ptr<VarDesc>> BlockDesc::Vars() const {
-  std::vector<std::shared_ptr<VarDesc>> res;
-  for (const auto &p : vars_) {
-    res.push_back(p.second);
-  }
-  return res;
+    std::vector<std::shared_ptr<VarDesc>> res;
+    for (const auto &p : vars_) {
+        res.push_back(p.second);
+    }
+    return res;
 }

 std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const {
-  std::vector<std::shared_ptr<OpDesc>> res;
-  for (const auto &op : ops_) {
-    res.push_back(op);
-  }
-  return res;
+    std::vector<std::shared_ptr<OpDesc>> res;
+    for (const auto &op : ops_) {
+        res.push_back(op);
+    }
+    return res;
 }

 BlockDesc::BlockDesc(const proto::BlockDesc &desc) : desc_(desc) {
-  for (const proto::VarDesc &var_desc : desc_.vars()) {
-    vars_[var_desc.name()].reset(new VarDesc(var_desc));
-  }
-  for (const proto::OpDesc &op_desc : desc_.ops()) {
-    ops_.emplace_back(new framework::OpDesc(op_desc));
-  }
+    for (const proto::VarDesc &var_desc : desc_.vars()) {
+        vars_[var_desc.name()].reset(new VarDesc(var_desc));
+    }
+    for (const proto::OpDesc &op_desc : desc_.ops()) {
+        ops_.emplace_back(new framework::OpDesc(op_desc));
+    }
 }

 } // namespace framework

--- a/src/framework/block_desc.h
+++ b/src/framework/block_desc.h
@@ -27,32 +27,29 @@ namespace paddle_mobile {
 namespace framework {

 class BlockDesc : PaddleMobileObject {
-public:
-  BlockDesc(const proto::BlockDesc &desc);
+  public:
+    BlockDesc(const proto::BlockDesc &desc);

-  const int &ID() const { return desc_.idx(); }
+    const int &ID() const { return desc_.idx(); }

-  const int &Parent() const { return desc_.parent_idx(); }
+    const int &Parent() const { return desc_.parent_idx(); }

-  bool operator==(
-      const paddle_mobile::framework::BlockDesc &in_block) const {
-    return this->ID() == in_block.ID() &&
-        this->Parent() == in_block.Parent();
-  }
+    bool operator==(const paddle_mobile::framework::BlockDesc &in_block) const {
+        return this->ID() == in_block.ID() &&
+               this->Parent() == in_block.Parent();
+    }

-  bool operator<(
-      const paddle_mobile::framework::BlockDesc &in_block) const {
-    return this->ID() < in_block.ID() &&
-        this->Parent() < in_block.Parent();
-  }
+    bool operator<(const paddle_mobile::framework::BlockDesc &in_block) const {
+        return this->ID() < in_block.ID() && this->Parent() < in_block.Parent();
+    }

-  std::vector<std::shared_ptr<VarDesc>> Vars() const;
-  std::vector<std::shared_ptr<OpDesc>> Ops() const;
+    std::vector<std::shared_ptr<VarDesc>> Vars() const;
+    std::vector<std::shared_ptr<OpDesc>> Ops() const;

-private:
-  proto::BlockDesc desc_;
-  std::vector<std::shared_ptr<OpDesc>> ops_;
-  std::unordered_map<std::string, std::shared_ptr<VarDesc>> vars_;
+  private:
+    proto::BlockDesc desc_;
+    std::vector<std::shared_ptr<OpDesc>> ops_;
+    std::unordered_map<std::string, std::shared_ptr<VarDesc>> vars_;
 };

 } // namespace framework
@@ -60,14 +57,14 @@ private:

 namespace std {

-template<> struct hash<paddle_mobile::framework::BlockDesc> {
-  typedef paddle_mobile::framework::BlockDesc argument_type;
-  typedef std::size_t result_type;
-  result_type operator()(argument_type const &s) const noexcept {
-    result_type const h1(std::hash<int>{}(s.ID()));
-    result_type const h2(std::hash<int>{}(s.ID()));
-    return h1 ^ (h2 << 1);
-  }
+template <> struct hash<paddle_mobile::framework::BlockDesc> {
+    typedef paddle_mobile::framework::BlockDesc argument_type;
+    typedef std::size_t result_type;
+    result_type operator()(argument_type const &s) const noexcept {
+        result_type const h1(std::hash<int>{}(s.ID()));
+        result_type const h2(std::hash<int>{}(s.ID()));
+        return h1 ^ (h2 << 1);
+    }
 };

 } // namespace std
--- a/src/framework/data_layout.h
+++ b/src/framework/data_layout.h
@@ -22,42 +22,45 @@ namespace paddle_mobile {
 namespace framework {

 enum class DataLayout {
-  kNHWC = 0,
-  kNCHW = 1,
-  kAnyLayout = 2,
+    kNHWC = 0,
+    kNCHW = 1,
+    kAnyLayout = 2,
 };

 inline DataLayout StringToDataLayout(const std::string &str) {
-  std::string s(str);
-  for (size_t i = 0; i < s.size(); ++i) {
-    s[i] = toupper(s[i]);
-  }
+    std::string s(str);
+    for (size_t i = 0; i < s.size(); ++i) {
+        s[i] = toupper(s[i]);
+    }

-  if (s == "NHWC") {
-    return DataLayout::kNHWC;
-  } else if (s == "NCHW") {
-    return DataLayout::kNCHW;
-  } else if (s == "ANYLAYOUT") {
-    return DataLayout::kAnyLayout;
-  } else {
-    //    std::cout << "Unknown storage order string: %s", s;
-  }
+    if (s == "NHWC") {
+        return DataLayout::kNHWC;
+    } else if (s == "NCHW") {
+        return DataLayout::kNCHW;
+    } else if (s == "ANYLAYOUT") {
+        return DataLayout::kAnyLayout;
+    } else {
+        //    std::cout << "Unknown storage order string: %s", s;
+    }
 }

 inline std::string DataLayoutToString(const DataLayout &data_layout) {
-  switch (data_layout) {
-  case DataLayout::kNHWC:return "NHWC";
-  case DataLayout::kNCHW:return "NCHW";
-  case DataLayout::kAnyLayout:return "ANY_LAYOUT";
-  default:break;
-    //      std::cout << "unknown DataLayou %d", data_layout;
-  }
+    switch (data_layout) {
+    case DataLayout::kNHWC:
+        return "NHWC";
+    case DataLayout::kNCHW:
+        return "NCHW";
+    case DataLayout::kAnyLayout:
+        return "ANY_LAYOUT";
+    default:
+        break;
+        //      std::cout << "unknown DataLayou %d", data_layout;
+    }
 }

-inline std::ostream &operator<<(std::ostream &out,
-                                const DataLayout &l) {
-  out << DataLayoutToString(l);
-  return out;
+inline std::ostream &operator<<(std::ostream &out, const DataLayout &l) {
+    out << DataLayoutToString(l);
+    return out;
 }

 } // namespace framework

--- a/src/framework/data_transform.cpp
+++ b/src/framework/data_transform.cpp
@@ -24,68 +24,68 @@ namespace paddle_mobile {
 namespace framework {

 static void PassTensorData(Tensor *from, Tensor *to) {
-  to->ShareDataWith(*from);
-  *from = Tensor();
+    to->ShareDataWith(*from);
+    *from = Tensor();
 }

 void DataTransform(const OpKernelType &expected_kernel_type,
                   const OpKernelType &kernel_type_for_var,
                   const Tensor &input_tensor, Tensor *output_tensor) {
-  bool transformed = false;
-  Tensor in;
-  in.ShareDataWith(input_tensor);
-  Tensor out;
+    bool transformed = false;
+    Tensor in;
+    in.ShareDataWith(input_tensor);
+    Tensor out;

-  //  // do layout transform
-  //  if (NeedTransformLayout(expected_kernel_type.data_layout_,
-  //                          kernel_type_for_var.data_layout_)) {
-  //    TransDataLayout(kernel_type_for_var, expected_kernel_type, in,
-  //    &out);
-  //    transformed = true;
-  //    PassTensorData(&out, &in);
-  //  }
-  //
-  //  // do data type transform
-  //  if (expected_kernel_type.data_type_ !=
-  //  kernel_type_for_var.data_type_) {
-  //    TransDataType(kernel_type_for_var, expected_kernel_type, in,
-  //    &out);
-  //    transformed = true;
-  //    PassTensorData(&out, &in);
-  //  }
-  //
-  //  // do device transform
-  //  if (!platform::is_same_place(kernel_type_for_var.place_,
-  //                               expected_kernel_type.place_)) {
-  //    TransDataDevice(in, expected_kernel_type.place_, &out);
-  //    transformed = true;
-  //    PassTensorData(&out, &in);
-  //  }
-  //
-  //  PADDLE_ENFORCE(transformed, "No transform is applied, please
-  //  check!");
-  // get output data
-  output_tensor->ShareDataWith(in);
+    //  // do layout transform
+    //  if (NeedTransformLayout(expected_kernel_type.data_layout_,
+    //                          kernel_type_for_var.data_layout_)) {
+    //    TransDataLayout(kernel_type_for_var, expected_kernel_type, in,
+    //    &out);
+    //    transformed = true;
+    //    PassTensorData(&out, &in);
+    //  }
+    //
+    //  // do data type transform
+    //  if (expected_kernel_type.data_type_ !=
+    //  kernel_type_for_var.data_type_) {
+    //    TransDataType(kernel_type_for_var, expected_kernel_type, in,
+    //    &out);
+    //    transformed = true;
+    //    PassTensorData(&out, &in);
+    //  }
+    //
+    //  // do device transform
+    //  if (!platform::is_same_place(kernel_type_for_var.place_,
+    //                               expected_kernel_type.place_)) {
+    //    TransDataDevice(in, expected_kernel_type.place_, &out);
+    //    transformed = true;
+    //    PassTensorData(&out, &in);
+    //  }
+    //
+    //  PADDLE_ENFORCE(transformed, "No transform is applied, please
+    //  check!");
+    // get output data
+    output_tensor->ShareDataWith(in);
 }

-void CopyVariableWithTensor(const Variable &in_var,
-                            const Tensor &tensor, Variable &out_var) {
-  //  if (in_var.IsType<LoDTensor>()) {
-  //    auto& in_lod_tensor = in_var.Get<LoDTensor>();
-  //    auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
-  //    tran_lod_tensor->set_lod(in_lod_tensor.lod());
-  //    tran_lod_tensor->set_layout(in_lod_tensor.layout());
-  //    tran_lod_tensor->ShareDataWith(tensor);
-  //  } else if (in_var.IsType<SelectedRows>()) {
-  //    auto& in_selected_rows = in_var.Get<SelectedRows>();
-  //    auto* trans_selected_rows =
-  //    out_var.GetMutable<SelectedRows>();
-  //    trans_selected_rows->set_height(in_selected_rows.height());
-  //    trans_selected_rows->set_rows(in_selected_rows.rows());
-  //    trans_selected_rows->mutable_value()->ShareDataWith(tensor);
-  //  } else {
-  //    PADDLE_THROW("unknown var type");
-  //  }
+void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor,
+                            Variable &out_var) {
+    //  if (in_var.IsType<LoDTensor>()) {
+    //    auto& in_lod_tensor = in_var.Get<LoDTensor>();
+    //    auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
+    //    tran_lod_tensor->set_lod(in_lod_tensor.lod());
+    //    tran_lod_tensor->set_layout(in_lod_tensor.layout());
+    //    tran_lod_tensor->ShareDataWith(tensor);
+    //  } else if (in_var.IsType<SelectedRows>()) {
+    //    auto& in_selected_rows = in_var.Get<SelectedRows>();
+    //    auto* trans_selected_rows =
+    //    out_var.GetMutable<SelectedRows>();
+    //    trans_selected_rows->set_height(in_selected_rows.height());
+    //    trans_selected_rows->set_rows(in_selected_rows.rows());
+    //    trans_selected_rows->mutable_value()->ShareDataWith(tensor);
+    //  } else {
+    //    PADDLE_THROW("unknown var type");
+    //  }
 }

 } // namespace framework

--- a/src/framework/data_transform.h
+++ b/src/framework/data_transform.h
@@ -28,14 +28,14 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        void DataTransform(const OpKernelType &expected_kernel_type,
-                           const OpKernelType &kernel_type_for_var,
-                           const Tensor &input_tensor, Tensor *out);
+void DataTransform(const OpKernelType &expected_kernel_type,
+                   const OpKernelType &kernel_type_for_var,
+                   const Tensor &input_tensor, Tensor *out);

-        void CopyVariableWithTensor(const Variable &in_var,
-                                    const Tensor &tensor, Variable &out_var);
+void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor,
+                            Variable &out_var);

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/data_type.h
+++ b/src/framework/data_type.h
@@ -21,23 +21,23 @@ SOFTWARE.
 #include "framework.pb.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        //    inline proto::VarType::Type ToDataType(std::type_index type) {
-        //        using namespace paddle_mobile::framework::proto;
-        //        if (typeid(float).hash_code() == type.hash_code()) {
-        //            return proto::VarType::FP32;
-        //        } else if (typeid(double).hash_code() == type.hash_code()) {
-        //            return proto::VarType::FP64;
-        //        } else if (typeid(int).hash_code() == type.hash_code()) {
-        //            return proto::VarType::INT32;
-        //        } else if (typeid(int64_t).hash_code() == type.hash_code()) {
-        //            return proto::VarType::INT64;
-        //        } else if (typeid(bool).hash_code() == type.hash_code()) {
-        //            return proto::VarType::BOOL;
-        //        } else {
-        ////            PADDLE_THROW("Not supported");
-        //        }
-        //    }
-    }
+//    inline proto::VarType::Type ToDataType(std::type_index type) {
+//        using namespace paddle_mobile::framework::proto;
+//        if (typeid(float).hash_code() == type.hash_code()) {
+//            return proto::VarType::FP32;
+//        } else if (typeid(double).hash_code() == type.hash_code()) {
+//            return proto::VarType::FP64;
+//        } else if (typeid(int).hash_code() == type.hash_code()) {
+//            return proto::VarType::INT32;
+//        } else if (typeid(int64_t).hash_code() == type.hash_code()) {
+//            return proto::VarType::INT64;
+//        } else if (typeid(bool).hash_code() == type.hash_code()) {
+//            return proto::VarType::BOOL;
+//        } else {
+////            PADDLE_THROW("Not supported");
+//        }
+//    }
+}
 } // namespace paddle_mobile
--- a/src/framework/ddim.cc
+++ b/src/framework/ddim.cc
@@ -15,320 +15,318 @@ limitations under the License. */
 #include "ddim.h"

 namespace paddle_mobile {
-    namespace framework {
-
-        /// @cond HIDDEN
-
-        template <int i> Dim<i> make_dim(const int64_t *d) {
-            return Dim<i>(*d, make_dim<i - 1>(d + 1));
-        }
-
-        template <> Dim<0> make_dim<0>(const int64_t *d) { return Dim<0>(*d); }
-
-        void make_ddim(DDim &ddim, const int64_t *dims, int n) {
-            switch (n) {
-            case 0:
-                ddim = make_dim<0>(dims);
-                break;
-            case 1:
-                ddim = make_dim<1>(dims);
-                break;
-            case 2:
-                ddim = make_dim<2>(dims);
-                break;
-            case 3:
-                ddim = make_dim<3>(dims);
-                break;
-            case 4:
-                ddim = make_dim<4>(dims);
-                break;
-            case 5:
-                ddim = make_dim<5>(dims);
-                break;
-            case 6:
-                ddim = make_dim<6>(dims);
-                break;
-            case 7:
-                ddim = make_dim<7>(dims);
-                break;
-            case 8:
-                ddim = make_dim<8>(dims);
-                break;
-            case 9:
-                ddim = make_dim<9>(dims);
-                break;
-            default:
-                //      std::cout << "Dynamic dimensions must have between [1,
-                //      9]
-                //      dimensions.";
-                break;
-            }
-        }
-
-        /// @endcond
-
-        DDim make_ddim(std::initializer_list<int64_t> dims) {
-            DDim result(make_dim(0));
-            make_ddim(result, dims.begin(), dims.size());
-            return result;
-        }
-
-        DDim make_ddim(const std::vector<int64_t> &dims) {
-            DDim result(make_dim(0));
-            make_ddim(result, &dims[0], dims.size());
-            return result;
-        }
-
-        DDim make_ddim(const std::vector<int> &dims) {
-            std::vector<int64_t> res(dims.size());
-            std::transform(dims.begin(), dims.end(), res.begin(),
-                           [](int d) { return static_cast<int64_t>(d); });
-            return make_ddim(res);
-        }
-
-        /// @cond HIDDEN
-        // XXX For some reason, putting this in an anonymous namespace causes
-        // errors
-        struct DynamicMutableIndexer : Vistor<int64_t &> {
-          public:
-            explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
-
-            template <int D> int64_t &operator()(Dim<D> &dim) const {
-                return dim[idx_];
-            }
-
-          private:
-            int idx_;
-        };
-
-        struct DynamicConstIndexer : public Vistor<int64_t> {
-          public:
-            explicit DynamicConstIndexer(int idx) : idx_(idx) {}
-
-            template <int D> int64_t operator()(const Dim<D> &dim) const {
-                return dim[idx_];
-            }
-
-          private:
-            int idx_;
-        };
-
-        /// @endcond
-
-        int64_t &DDim::operator[](int idx) {
-            return DDim::ApplyVistor(DynamicMutableIndexer(idx), *this);
-        }
-
-        int64_t DDim::operator[](int idx) const {
-            return DDim::ApplyVistor(DynamicConstIndexer(idx), *this);
-        }
-
-        int DDim::size() const { return arity(*this); }
-
-        bool DDim::operator==(DDim d) const {
-            //  if (var.which() != d.getVar().which()) {
-            //    return false;
-            //  } else {
-            std::vector<int64_t> v1 = vectorize(*this);
-            std::vector<int64_t> v2 = vectorize(d);
-
-            for (unsigned int i = 0; i < v1.size(); i++) {
-                if (v1[i] != v2[i]) {
-                    return false;
-                }
-            }
-
-            return true;
-            //  }
-        }
-
-        bool DDim::operator!=(DDim d) const { return !(*this == d); }
-
-        DDim DDim::operator+(DDim d) const {
-            std::vector<int64_t> v1 = vectorize(*this);
-            std::vector<int64_t> v2 = vectorize(d);
-
-            std::vector<int64_t> v3;
-
-            assert(v1.size() == v2.size());
-
-            for (unsigned int i = 0; i < v1.size(); i++) {
-                v3.push_back(v1[i] + v2[i]);
-            }
-
-            return make_ddim(v3);
-        }
-
-        DDim DDim::operator*(DDim d) const {
-            std::vector<int64_t> v1 = vectorize(*this);
-            std::vector<int64_t> v2 = vectorize(d);
-
-            std::vector<int64_t> v3;
-
-            assert(v1.size() == v2.size());
-
-            for (unsigned int i = 0; i < v1.size(); i++) {
-                v3.push_back(v1[i] * v2[i]);
-            }
-
-            return make_ddim(v3);
+namespace framework {
+
+/// @cond HIDDEN
+
+template <int i> Dim<i> make_dim(const int64_t *d) {
+    return Dim<i>(*d, make_dim<i - 1>(d + 1));
+}
+
+template <> Dim<0> make_dim<0>(const int64_t *d) { return Dim<0>(*d); }
+
+void make_ddim(DDim &ddim, const int64_t *dims, int n) {
+    switch (n) {
+    case 0:
+        ddim = make_dim<0>(dims);
+        break;
+    case 1:
+        ddim = make_dim<1>(dims);
+        break;
+    case 2:
+        ddim = make_dim<2>(dims);
+        break;
+    case 3:
+        ddim = make_dim<3>(dims);
+        break;
+    case 4:
+        ddim = make_dim<4>(dims);
+        break;
+    case 5:
+        ddim = make_dim<5>(dims);
+        break;
+    case 6:
+        ddim = make_dim<6>(dims);
+        break;
+    case 7:
+        ddim = make_dim<7>(dims);
+        break;
+    case 8:
+        ddim = make_dim<8>(dims);
+        break;
+    case 9:
+        ddim = make_dim<9>(dims);
+        break;
+    default:
+        //      std::cout << "Dynamic dimensions must have between [1,
+        //      9]
+        //      dimensions.";
+        break;
+    }
+}
+
+/// @endcond
+
+DDim make_ddim(std::initializer_list<int64_t> dims) {
+    DDim result(make_dim(0));
+    make_ddim(result, dims.begin(), dims.size());
+    return result;
+}
+
+DDim make_ddim(const std::vector<int64_t> &dims) {
+    DDim result(make_dim(0));
+    make_ddim(result, &dims[0], dims.size());
+    return result;
+}
+
+DDim make_ddim(const std::vector<int> &dims) {
+    std::vector<int64_t> res(dims.size());
+    std::transform(dims.begin(), dims.end(), res.begin(),
+                   [](int d) { return static_cast<int64_t>(d); });
+    return make_ddim(res);
+}
+
+/// @cond HIDDEN
+// XXX For some reason, putting this in an anonymous namespace causes
+// errors
+struct DynamicMutableIndexer : Vistor<int64_t &> {
+  public:
+    explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
+
+    template <int D> int64_t &operator()(Dim<D> &dim) const {
+        return dim[idx_];
+    }
+
+  private:
+    int idx_;
+};
+
+struct DynamicConstIndexer : public Vistor<int64_t> {
+  public:
+    explicit DynamicConstIndexer(int idx) : idx_(idx) {}
+
+    template <int D> int64_t operator()(const Dim<D> &dim) const {
+        return dim[idx_];
+    }
+
+  private:
+    int idx_;
+};
+
+/// @endcond
+
+int64_t &DDim::operator[](int idx) {
+    return DDim::ApplyVistor(DynamicMutableIndexer(idx), *this);
+}
+
+int64_t DDim::operator[](int idx) const {
+    return DDim::ApplyVistor(DynamicConstIndexer(idx), *this);
+}
+
+int DDim::size() const { return arity(*this); }
+
+bool DDim::operator==(DDim d) const {
+    //  if (var.which() != d.getVar().which()) {
+    //    return false;
+    //  } else {
+    std::vector<int64_t> v1 = vectorize(*this);
+    std::vector<int64_t> v2 = vectorize(d);
+
+    for (unsigned int i = 0; i < v1.size(); i++) {
+        if (v1[i] != v2[i]) {
+            return false;
        }
+    }

-        int64_t get(const DDim &ddim, int idx) { return ddim[idx]; }
-
-        void set(DDim &ddim, int idx, int value) { ddim[idx] = value; }
-
-        /// @cond HIDDEN
-        struct VectorizeVisitor : Vistor<void> {
-            std::vector<int64_t> &vector;
-
-            explicit VectorizeVisitor(std::vector<int64_t> &v) : vector(v) {}
-
-            template <typename T> void operator()(const T &t) {
-                vector.push_back(t.head);
-                this->operator()(t.tail);
-            }
-
-            void operator()(const Dim<0> &t) {}
-        };
-        /// @endcond
-
-        std::vector<int64_t> vectorize(const DDim &ddim) {
-            std::vector<int64_t> result;
-            VectorizeVisitor visitor(result);
-            DDim::ApplyVistor(visitor, ddim);
-            return result;
+    return true;
+    //  }
+}
+
+bool DDim::operator!=(DDim d) const { return !(*this == d); }
+
+DDim DDim::operator+(DDim d) const {
+    std::vector<int64_t> v1 = vectorize(*this);
+    std::vector<int64_t> v2 = vectorize(d);
+
+    std::vector<int64_t> v3;
+
+    assert(v1.size() == v2.size());
+
+    for (unsigned int i = 0; i < v1.size(); i++) {
+        v3.push_back(v1[i] + v2[i]);
+    }
+
+    return make_ddim(v3);
+}
+
+DDim DDim::operator*(DDim d) const {
+    std::vector<int64_t> v1 = vectorize(*this);
+    std::vector<int64_t> v2 = vectorize(d);
+
+    std::vector<int64_t> v3;
+
+    assert(v1.size() == v2.size());
+
+    for (unsigned int i = 0; i < v1.size(); i++) {
+        v3.push_back(v1[i] * v2[i]);
+    }
+
+    return make_ddim(v3);
+}
+
+int64_t get(const DDim &ddim, int idx) { return ddim[idx]; }
+
+void set(DDim &ddim, int idx, int value) { ddim[idx] = value; }
+
+/// @cond HIDDEN
+struct VectorizeVisitor : Vistor<void> {
+    std::vector<int64_t> &vector;
+
+    explicit VectorizeVisitor(std::vector<int64_t> &v) : vector(v) {}
+
+    template <typename T> void operator()(const T &t) {
+        vector.push_back(t.head);
+        this->operator()(t.tail);
+    }
+
+    void operator()(const Dim<0> &t) {}
+};
+/// @endcond
+
+std::vector<int64_t> vectorize(const DDim &ddim) {
+    std::vector<int64_t> result;
+    VectorizeVisitor visitor(result);
+    DDim::ApplyVistor(visitor, ddim);
+    return result;
+}
+
+// NOTE: framework::vectorize converts to type int64_t
+//       which does not fit cudnn inputs.
+std::vector<int> vectorize2int(const DDim &ddim) {
+    std::vector<int64_t> temp = vectorize(ddim);
+    std::vector<int> result(temp.begin(), temp.end());
+    return result;
+}
+
+struct ProductVisitor : Vistor<int64_t> {
+    template <int D> int64_t operator()(const Dim<D> &dim) {
+        return product(dim);
+    }
+};
+
+int64_t product(const DDim &ddim) {
+    ProductVisitor visitor;
+    return DDim::ApplyVistor(visitor, ddim);
+}
+
+struct SliceVectorizeVisitor : Vistor<void> {
+    std::vector<int64_t> &vector;
+    int begin;
+    int end;
+
+    SliceVectorizeVisitor(std::vector<int64_t> &v, int b, int e)
+        : vector(v), begin(b), end(e) {
+        //    PADDLE_ENFORCE(begin < end,
+        //                   "Begin index must be less than end index in
+        //                   ddim
+        //                   slice.");
+        //    PADDLE_ENFORCE(begin >= 0,
+        //                   "Begin index can't be less than zero in
+        //                   ddim slice.");
+    }
+
+    template <int S> void operator()(const Dim<S> &dim) {
+        if (begin == 0) {
+            vector.push_back(dim.head);
+        } else {
+            --begin;
        }
-
-        // NOTE: framework::vectorize converts to type int64_t
-        //       which does not fit cudnn inputs.
-        std::vector<int> vectorize2int(const DDim &ddim) {
-            std::vector<int64_t> temp = vectorize(ddim);
-            std::vector<int> result(temp.begin(), temp.end());
-            return result;
+        --end;
+        if (end > 0) {
+            this->operator()(dim.tail);
        }
-
-        struct ProductVisitor : Vistor<int64_t> {
-            template <int D> int64_t operator()(const Dim<D> &dim) {
-                return product(dim);
-            }
-        };
-
-        int64_t product(const DDim &ddim) {
-            ProductVisitor visitor;
-            return DDim::ApplyVistor(visitor, ddim);
-        }
-
-        struct SliceVectorizeVisitor : Vistor<void> {
-            std::vector<int64_t> &vector;
-            int begin;
-            int end;
-
-            SliceVectorizeVisitor(std::vector<int64_t> &v, int b, int e)
-                : vector(v), begin(b), end(e) {
-                //    PADDLE_ENFORCE(begin < end,
-                //                   "Begin index must be less than end index in
-                //                   ddim
-                //                   slice.");
-                //    PADDLE_ENFORCE(begin >= 0,
-                //                   "Begin index can't be less than zero in
-                //                   ddim slice.");
-            }
-
-            template <int S> void operator()(const Dim<S> &dim) {
-                if (begin == 0) {
-                    vector.push_back(dim.head);
-                } else {
-                    --begin;
-                }
-                --end;
-                if (end > 0) {
-                    this->operator()(dim.tail);
-                }
-            }
-
-            void operator()(const Dim<0> &dim) {
-                //    PADDLE_ENFORCE(end == 0, "End index in ddim slice is out
-                //    of bound.");
-            }
-        };
-
-        DDim slice_ddim(const DDim &ddim, int begin, int end) {
-            std::vector<int64_t> vec;
-            vec.reserve(end - begin);
-            SliceVectorizeVisitor visitor(vec, begin, end);
-            //  boost::apply_visitor(visitor, dim);
-            DDim::ApplyVistor(visitor, ddim);
-            //  visitor(ddim.var.Get<Dim<4>>());
-            return make_ddim(vec);
-        }
-
-        /// \cond HIDDEN
-
-        struct ArityVisitor : Vistor<int> {
-            template <int D> int operator()(Dim<D>) const { return D; }
-        };
-
-        /// \endcond
-
-        int arity(const DDim &d) {
-            ArityVisitor arityVisitor = ArityVisitor();
-            return DDim::ApplyVistor(arityVisitor, d);
-            //  return arityVisitor(d.var.Get<Dim<4>>());
-            //  return boost::apply_visitor(ArityVisitor(), d); }
-        }
-        /// \cond HIDDEN
-
-        /// \endcond
-
-        struct OSVistor : Vistor<std::ostream &> {
-            OSVistor(std::ostream &os) : os_(os) {}
-
-            template <int D> std::ostream &operator()(Dim<D> dim) const {
-                return os_ << dim;
-            }
-
-          private:
-            std::ostream &os_;
-        };
-
-        std::ostream &operator<<(std::ostream &os, const DDim &ddim) {
-            auto vistor = OSVistor(os);
-            DDim::ApplyVistor(vistor, ddim);
-            return os;
-        }
-
-        DDim::DDim(std::initializer_list<int64_t> init_list) {
-            *this = make_ddim(init_list);
-        }
-
-        DDim flatten_to_2d(const DDim &src, int num_col_dims) {
-            int rank = src.size();
-            return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
-                              product(slice_ddim(src, num_col_dims, rank))});
-        }
-
-        DDim flatten_to_1d(const DDim &src) {
-            return make_ddim({product(src)});
-        }
-
-        DDim stride(const DDim &ddim) {
-            std::vector<int64_t> strides(ddim.size());
-            strides[ddim.size() - 1] = 1;
-            for (int i = ddim.size() - 2; i >= 0; --i) {
-                strides[i] = strides[i + 1] * ddim[i + 1];
-            }
-            return framework::make_ddim(strides);
-        }
-
-        DDim stride_numel(const framework::DDim &ddim) {
-            std::vector<int64_t> strides(ddim.size());
-            strides[ddim.size() - 1] = ddim[ddim.size() - 1];
-            for (int i = ddim.size() - 2; i >= 0; --i) {
-                strides[i] = strides[i + 1] * ddim[i];
-            }
-            return framework::make_ddim(strides);
-        }
-
-    } // namespace framework
+    }
+
+    void operator()(const Dim<0> &dim) {
+        //    PADDLE_ENFORCE(end == 0, "End index in ddim slice is out
+        //    of bound.");
+    }
+};
+
+DDim slice_ddim(const DDim &ddim, int begin, int end) {
+    std::vector<int64_t> vec;
+    vec.reserve(end - begin);
+    SliceVectorizeVisitor visitor(vec, begin, end);
+    //  boost::apply_visitor(visitor, dim);
+    DDim::ApplyVistor(visitor, ddim);
+    //  visitor(ddim.var.Get<Dim<4>>());
+    return make_ddim(vec);
+}
+
+/// \cond HIDDEN
+
+struct ArityVisitor : Vistor<int> {
+    template <int D> int operator()(Dim<D>) const { return D; }
+};
+
+/// \endcond
+
+int arity(const DDim &d) {
+    ArityVisitor arityVisitor = ArityVisitor();
+    return DDim::ApplyVistor(arityVisitor, d);
+    //  return arityVisitor(d.var.Get<Dim<4>>());
+    //  return boost::apply_visitor(ArityVisitor(), d); }
+}
+/// \cond HIDDEN
+
+/// \endcond
+
+struct OSVistor : Vistor<std::ostream &> {
+    OSVistor(std::ostream &os) : os_(os) {}
+
+    template <int D> std::ostream &operator()(Dim<D> dim) const {
+        return os_ << dim;
+    }
+
+  private:
+    std::ostream &os_;
+};
+
+std::ostream &operator<<(std::ostream &os, const DDim &ddim) {
+    auto vistor = OSVistor(os);
+    DDim::ApplyVistor(vistor, ddim);
+    return os;
+}
+
+DDim::DDim(std::initializer_list<int64_t> init_list) {
+    *this = make_ddim(init_list);
+}
+
+DDim flatten_to_2d(const DDim &src, int num_col_dims) {
+    int rank = src.size();
+    return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
+                      product(slice_ddim(src, num_col_dims, rank))});
+}
+
+DDim flatten_to_1d(const DDim &src) { return make_ddim({product(src)}); }
+
+DDim stride(const DDim &ddim) {
+    std::vector<int64_t> strides(ddim.size());
+    strides[ddim.size() - 1] = 1;
+    for (int i = ddim.size() - 2; i >= 0; --i) {
+        strides[i] = strides[i + 1] * ddim[i + 1];
+    }
+    return framework::make_ddim(strides);
+}
+
+DDim stride_numel(const framework::DDim &ddim) {
+    std::vector<int64_t> strides(ddim.size());
+    strides[ddim.size() - 1] = ddim[ddim.size() - 1];
+    for (int i = ddim.size() - 2; i >= 0; --i) {
+        strides[i] = strides[i + 1] * ddim[i];
+    }
+    return framework::make_ddim(strides);
+}
+
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/ddim.h
+++ b/src/framework/ddim.h
@@ -22,145 +22,142 @@ limitations under the License. */
 #include <vector>

 namespace paddle_mobile {
-    namespace framework {
-
-        /**
-         * \brief A dynamically sized dimension.
-         *
-         * The number of dimensions must be between [1, 9].
-         */
-        struct DDim {
-            typedef Variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
-                            Dim<6>, Dim<7>, Dim<8>, Dim<9>>
-                DDimVar;
-            DDimVar var;
-
-            template <typename Vistor>
-            static typename Vistor::type_t ApplyVistor(Vistor vistor,
-                                                       const DDim &d) {
-                if (d.var.TypeId() == typeid(Dim<0>).hash_code()) {
-                    return vistor(d.var.Get<Dim<0>>());
-                } else if (d.var.TypeId() == typeid(Dim<1>).hash_code()) {
-                    return vistor(d.var.Get<Dim<1>>());
-                } else if (d.var.TypeId() == typeid(Dim<2>).hash_code()) {
-                    return vistor(d.var.Get<Dim<2>>());
-                } else if (d.var.TypeId() == typeid(Dim<3>).hash_code()) {
-                    return vistor(d.var.Get<Dim<3>>());
-                } else if (d.var.TypeId() == typeid(Dim<4>).hash_code()) {
-                    return vistor(d.var.Get<Dim<4>>());
-                } else if (d.var.TypeId() == typeid(Dim<5>).hash_code()) {
-                    return vistor(d.var.Get<Dim<5>>());
-                } else if (d.var.TypeId() == typeid(Dim<6>).hash_code()) {
-                    return vistor(d.var.Get<Dim<6>>());
-                } else if (d.var.TypeId() == typeid(Dim<7>).hash_code()) {
-                    return vistor(d.var.Get<Dim<7>>());
-                } else if (d.var.TypeId() == typeid(Dim<8>).hash_code()) {
-                    return vistor(d.var.Get<Dim<8>>());
-                } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
-                    return vistor(d.var.Get<Dim<9>>());
-                } else {
-                    printf(" dim not support  \n");
-                    throw std::bad_exception();
-                    //        return typename Vistor::type_t();
-                }
-            }
-
-            DDim() { var.Set<Dim<1>>(Dim<1>()); }
-
-            template <int D> explicit DDim(const Dim<D> &in) {
-                var.Set<Dim<D>>(in);
-            }
-
-            /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
-
-            template <int D> DDim &operator=(const Dim<D> &in) {
-                var.Set<Dim<D>>(in);
-                return *this;
-            }
-
-            int64_t &operator[](int idx);
-
-            int64_t operator[](int idx) const;
-
-            //  template <typename Visitor>
-            //  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-            //    return var.apply_visitor(visitor);
-            //  }
-            //
-            //  template <typename Visitor>
-            //  typename Visitor::result_type apply_visitor(Visitor& visitor)
-            //  const {
-            //    return var.apply_visitor(visitor);
-            //  }
-
-            DDimVar getVar() { return var; }
-
-            bool operator==(DDim d) const;
-
-            bool operator!=(DDim d) const;
-
-            DDim operator+(DDim d) const;
-
-            DDim operator*(DDim d) const;
-
-            int size() const;
-        };
-
-        /**
-         * \brief Make a DDim from std::vector<int64_t>
-         *
-         * \param dims An vector of ints. Must be sized between [1, 9]
-         */
-        DDim make_ddim(const std::vector<int64_t> &dims);
-
-        DDim make_ddim(const std::vector<int> &dims);
-
-        /**
-         * \brief Make a DDim from an initializer list
-         *
-         * \param dims An initializer list of ints. Must be sized between [1, 9]
-         *
-         */
-        DDim make_ddim(std::initializer_list<int64_t> dims);
-
-        int64_t get(const DDim &dim, int idx);
-
-        void set(DDim &dim, int idx, int val);
-
-        std::vector<int64_t> vectorize(const DDim &ddim);
-
-        std::vector<int> vectorize2int(const DDim &ddim);
-
-        int64_t product(const DDim &ddim);
-
-        /**
-         * \brief Slice a ddim
-         *
-         * Slice dim with [begin, end).
-         * e.g.  DDim d = make_ddim({1,2,3,4,5});
-         *       slice_ddim(d, 1, 3); ====> {2,3}
-         */
-        DDim slice_ddim(const DDim &dim, int begin, int end);
-
-        /**
-         * \brief What is the length of this dimension?
-         *
-         * \param Dynamic dimension to inspect
-         */
+namespace framework {
+
+/**
+ * \brief A dynamically sized dimension.
+ *
+ * The number of dimensions must be between [1, 9].
+ */
+struct DDim {
+    typedef Variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
+                    Dim<7>, Dim<8>, Dim<9>>
+        DDimVar;
+    DDimVar var;
+
+    template <typename Vistor>
+    static typename Vistor::type_t ApplyVistor(Vistor vistor, const DDim &d) {
+        if (d.var.TypeId() == typeid(Dim<0>).hash_code()) {
+            return vistor(d.var.Get<Dim<0>>());
+        } else if (d.var.TypeId() == typeid(Dim<1>).hash_code()) {
+            return vistor(d.var.Get<Dim<1>>());
+        } else if (d.var.TypeId() == typeid(Dim<2>).hash_code()) {
+            return vistor(d.var.Get<Dim<2>>());
+        } else if (d.var.TypeId() == typeid(Dim<3>).hash_code()) {
+            return vistor(d.var.Get<Dim<3>>());
+        } else if (d.var.TypeId() == typeid(Dim<4>).hash_code()) {
+            return vistor(d.var.Get<Dim<4>>());
+        } else if (d.var.TypeId() == typeid(Dim<5>).hash_code()) {
+            return vistor(d.var.Get<Dim<5>>());
+        } else if (d.var.TypeId() == typeid(Dim<6>).hash_code()) {
+            return vistor(d.var.Get<Dim<6>>());
+        } else if (d.var.TypeId() == typeid(Dim<7>).hash_code()) {
+            return vistor(d.var.Get<Dim<7>>());
+        } else if (d.var.TypeId() == typeid(Dim<8>).hash_code()) {
+            return vistor(d.var.Get<Dim<8>>());
+        } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
+            return vistor(d.var.Get<Dim<9>>());
+        } else {
+            printf(" dim not support  \n");
+            throw std::bad_exception();
+            //        return typename Vistor::type_t();
+        }
+    }
+
+    DDim() { var.Set<Dim<1>>(Dim<1>()); }
+
+    template <int D> explicit DDim(const Dim<D> &in) { var.Set<Dim<D>>(in); }
+
+    /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+
+    template <int D> DDim &operator=(const Dim<D> &in) {
+        var.Set<Dim<D>>(in);
+        return *this;
+    }
+
+    int64_t &operator[](int idx);
+
+    int64_t operator[](int idx) const;
+
+    //  template <typename Visitor>
+    //  typename Visitor::result_type apply_visitor(Visitor& visitor) {
+    //    return var.apply_visitor(visitor);
+    //  }
+    //
+    //  template <typename Visitor>
+    //  typename Visitor::result_type apply_visitor(Visitor& visitor)
+    //  const {
+    //    return var.apply_visitor(visitor);
+    //  }
+
+    DDimVar getVar() { return var; }
+
+    bool operator==(DDim d) const;
+
+    bool operator!=(DDim d) const;
+
+    DDim operator+(DDim d) const;
+
+    DDim operator*(DDim d) const;
+
+    int size() const;
+};
+
+/**
+ * \brief Make a DDim from std::vector<int64_t>
+ *
+ * \param dims An vector of ints. Must be sized between [1, 9]
+ */
+DDim make_ddim(const std::vector<int64_t> &dims);
+
+DDim make_ddim(const std::vector<int> &dims);
+
+/**
+ * \brief Make a DDim from an initializer list
+ *
+ * \param dims An initializer list of ints. Must be sized between [1, 9]
+ *
+ */
+DDim make_ddim(std::initializer_list<int64_t> dims);
+
+int64_t get(const DDim &dim, int idx);
+
+void set(DDim &dim, int idx, int val);
+
+std::vector<int64_t> vectorize(const DDim &ddim);
+
+std::vector<int> vectorize2int(const DDim &ddim);
+
+int64_t product(const DDim &ddim);
+
+/**
+ * \brief Slice a ddim
+ *
+ * Slice dim with [begin, end).
+ * e.g.  DDim d = make_ddim({1,2,3,4,5});
+ *       slice_ddim(d, 1, 3); ====> {2,3}
+ */
+DDim slice_ddim(const DDim &dim, int begin, int end);
+
+/**
+ * \brief What is the length of this dimension?
+ *
+ * \param Dynamic dimension to inspect
+ */

-        int arity(const DDim &ddim);
+int arity(const DDim &ddim);

-        std::ostream &operator<<(std::ostream &, const DDim &);
+std::ostream &operator<<(std::ostream &, const DDim &);

-        // Reshape a tensor to a matrix. The matrix's first dimension(column
-        // length)
-        // will be the product of tensor's first `num_col_dims` dimensions.
-        DDim flatten_to_2d(const DDim &src, int num_col_dims);
+// Reshape a tensor to a matrix. The matrix's first dimension(column
+// length)
+// will be the product of tensor's first `num_col_dims` dimensions.
+DDim flatten_to_2d(const DDim &src, int num_col_dims);

-        DDim flatten_to_1d(const DDim &src);
+DDim flatten_to_1d(const DDim &src);

-        DDim stride(const DDim &ddim);
+DDim stride(const DDim &ddim);

-        DDim stride_numel(const DDim &ddim);
-    } // namespace framework
+DDim stride_numel(const DDim &ddim);
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/dim.h
+++ b/src/framework/dim.h
@@ -21,410 +21,392 @@
 #include "platform/hostdevice.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        // Statically sized, statically indexed dimension
-        template <int i> struct Dim {
-            static constexpr int dimensions = i;
+// Statically sized, statically indexed dimension
+template <int i> struct Dim {
+    static constexpr int dimensions = i;

-            template <typename... Args>
-            HOSTDEVICE Dim(int64_t _head, Args... _tail)
-                : head(_head), tail(_tail...) {
-                static_assert(
-                    sizeof...(_tail) == i - 1,
-                    "Dim initialized with the wrong number of parameters");
-            }
+    template <typename... Args>
+    HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
+        static_assert(sizeof...(_tail) == i - 1,
+                      "Dim initialized with the wrong number of parameters");
+    }

-            HOSTDEVICE
-            Dim(int64_t _head, const Dim<i - 1> &_tail)
-                : head(_head), tail(_tail) {}
+    HOSTDEVICE
+    Dim(int64_t _head, const Dim<i - 1> &_tail) : head(_head), tail(_tail) {}

-            HOSTDEVICE
-            Dim() : head(0), tail() {}
+    HOSTDEVICE
+    Dim() : head(0), tail() {}

-            /** Construct a Dim from a linear index and size.  Uses Fortran
-             * order
-             * indexing. */
-            HOSTDEVICE
-            Dim(int64_t idx, const Dim<i> &size)
-                : head(idx % size.head), tail(idx / size.head, size.tail) {}
+    /** Construct a Dim from a linear index and size. Uses
+     * Fortran-order indexing: the head (first) dimension
+     * varies fastest. */
+    HOSTDEVICE
+    Dim(int64_t idx, const Dim<i> &size)
+        : head(idx % size.head), tail(idx / size.head, size.tail) {}

-            /** Construct a Dim with each dimension set to the given index */
-            HOSTDEVICE
-            Dim(int64_t idx) : head(idx), tail(idx) {}
+    /** Construct a Dim with each dimension set to the given index */
+    HOSTDEVICE
+    Dim(int64_t idx) : head(idx), tail(idx) {}

-            HOSTDEVICE
-            bool operator==(const Dim<i> &o) const {
-                return (head == o.head) && (tail == o.tail);
-            }
+    HOSTDEVICE
+    bool operator==(const Dim<i> &o) const {
+        return (head == o.head) && (tail == o.tail);
+    }

-            HOSTDEVICE
-            bool operator!=(const Dim<i> &o) const { return !(*this == o); }
+    HOSTDEVICE
+    bool operator!=(const Dim<i> &o) const { return !(*this == o); }

-            HOSTDEVICE
-            int64_t &operator[](int idx);
-            HOSTDEVICE
-            int64_t operator[](int idx) const;
+    HOSTDEVICE
+    int64_t &operator[](int idx);
+    HOSTDEVICE
+    int64_t operator[](int idx) const;

-            HOST std::string to_string() const;
+    HOST std::string to_string() const;

-            int64_t head;
-            Dim<i - 1> tail;
-        };
+    int64_t head;
+    Dim<i - 1> tail;
+};

-        // Base case specialization
-        template <> struct Dim<0> {
-            static constexpr int dimensions = 0;
+// Base case specialization
+template <> struct Dim<0> {
+    static constexpr int dimensions = 0;

-            HOSTDEVICE
-            Dim(int64_t _head) {}
+    HOSTDEVICE
+    Dim(int64_t _head) {}

-            HOSTDEVICE
-            Dim() {}
+    HOSTDEVICE
+    Dim() {}

-            HOSTDEVICE
-            Dim(int idx, const Dim<0> &size) {
+    HOSTDEVICE
+    Dim(int idx, const Dim<0> &size) {
 #ifndef __CUDA_ARCH__
-                if (idx > 0) {
-                    throw std::invalid_argument("Index out of range.");
-                }
+        if (idx > 0) {
+            throw std::invalid_argument("Index out of range.");
+        }
 #else
-                PADDLE_ASSERT(idx == 0);
+        PADDLE_ASSERT(idx == 0);
 #endif
-            }
-
-            HOSTDEVICE
-            bool operator==(const Dim<0> &o) const { return true; }
-
-            HOSTDEVICE
-            bool operator!=(const Dim<0> &o) const { return false; }
-
-            HOSTDEVICE
-            int64_t &operator[](int idx);
-            HOSTDEVICE
-            int64_t operator[](int idx) const;
-        };
-
-        namespace {
-
-            // Helper for accessing Dim classes
-            template <int i> struct DimGetter {
-                // Return a copy if Dim is const
-                template <typename D>
-                HOSTDEVICE static int64_t impl(const D &d) {
-                    return DimGetter<i - 1>::impl(d.tail);
-                }
-                // Return a reference if Dim is mutable
-                template <typename D> HOSTDEVICE static int64_t &impl(D &d) {
-                    return DimGetter<i - 1>::impl(d.tail);
-                }
-            };
-
-            // Eureka! We found the element!
-            template <> struct DimGetter<0> {
-                // Return a copy if Dim is const
-                template <typename D>
-                HOSTDEVICE static int64_t impl(const D &d) {
-                    return d.head;
-                }
-                // Return a reference if Dim is mutable
-                template <typename D> HOSTDEVICE static int64_t &impl(D &d) {
-                    return d.head;
-                }
-            };
-
-            template <int D> HOSTDEVICE int64_t &indexer(Dim<D> &dim, int idx) {
+    }
+
+    HOSTDEVICE
+    bool operator==(const Dim<0> &o) const { return true; }
+
+    HOSTDEVICE
+    bool operator!=(const Dim<0> &o) const { return false; }
+
+    HOSTDEVICE
+    int64_t &operator[](int idx);
+    HOSTDEVICE
+    int64_t operator[](int idx) const;
+};
+
+namespace {
+
+// Helper for accessing Dim classes
+template <int i> struct DimGetter {
+    // Return a copy if Dim is const
+    template <typename D> HOSTDEVICE static int64_t impl(const D &d) {
+        return DimGetter<i - 1>::impl(d.tail);
+    }
+    // Return a reference if Dim is mutable
+    template <typename D> HOSTDEVICE static int64_t &impl(D &d) {
+        return DimGetter<i - 1>::impl(d.tail);
+    }
+};
+
+// Eureka! We found the element!
+template <> struct DimGetter<0> {
+    // Return a copy if Dim is const
+    template <typename D> HOSTDEVICE static int64_t impl(const D &d) {
+        return d.head;
+    }
+    // Return a reference if Dim is mutable
+    template <typename D> HOSTDEVICE static int64_t &impl(D &d) {
+        return d.head;
+    }
+};
+
+template <int D> HOSTDEVICE int64_t &indexer(Dim<D> &dim, int idx) {
 #ifndef __CUDA_ARCH__
-                if (idx < 0) {
-                    throw std::invalid_argument(
-                        "Tried to access a negative dimension");
-                }
+    if (idx < 0) {
+        throw std::invalid_argument("Tried to access a negative dimension");
+    }
 #else
-                PADDLE_ASSERT(idx >= 0);
+    PADDLE_ASSERT(idx >= 0);
 #endif
-                if (idx == 0) {
-                    return dim.head;
-                }
-                return indexer(dim.tail, idx - 1);
-            }
+    if (idx == 0) {
+        return dim.head;
+    }
+    return indexer(dim.tail, idx - 1);
+}

-            template <> HOSTDEVICE int64_t &indexer<0>(Dim<0> &dim, int idx) {
+template <> HOSTDEVICE int64_t &indexer<0>(Dim<0> &dim, int idx) {
 #ifndef __CUDA_ARCH__
-                throw std::invalid_argument("Invalid index");
+    throw std::invalid_argument("Invalid index");
 #else
-                PADDLE_ASSERT(false);
+    PADDLE_ASSERT(false);
 #if CUDA_VERSION < 8000
-                // On CUDA versions previous to 8.0, only __shared__ variables
-                // could be declared as static in the device code.
-                int64_t head = 0;
+    // On CUDA versions previous to 8.0, only __shared__ variables
+    // could be declared as static in the device code.
+    int64_t head = 0;
 #else
-                static int64_t head = 0;
+    static int64_t head = 0;
 #endif
-                return head;
+    return head;
 #endif
-            }
+}

-            template <int D>
-            HOSTDEVICE int64_t indexer(const Dim<D> &dim, int idx) {
+template <int D> HOSTDEVICE int64_t indexer(const Dim<D> &dim, int idx) {
 #ifndef __CUDA_ARCH__
-                if (idx < 0) {
-                    throw std::invalid_argument(
-                        "Tried to access a negative dimension");
-                }
+    if (idx < 0) {
+        throw std::invalid_argument("Tried to access a negative dimension");
+    }
 #else
-                PADDLE_ASSERT(idx >= 0);
+    PADDLE_ASSERT(idx >= 0);
 #endif
-                if (idx == 0) {
-                    return dim.head;
-                }
-                return indexer(dim.tail, idx - 1);
-            }
-
-            template <>
-            HOSTDEVICE int64_t indexer<0>(const Dim<0> &dim, int idx) {
+    if (idx == 0) {
+        return dim.head;
+    }
+    return indexer(dim.tail, idx - 1);
+}
+
+template <> HOSTDEVICE int64_t indexer<0>(const Dim<0> &dim, int idx) {
 #ifndef __CUDA_ARCH__
-                throw std::invalid_argument("Invalid index");
+    throw std::invalid_argument("Invalid index");
 #else
-                PADDLE_ASSERT(false);
+    PADDLE_ASSERT(false);
 #if CUDA_VERSION < 8000
-                // On CUDA versions previous to 8.0, only __shared__ variables
-                // could be declared as static in the device code.
-                int64_t head = 0;
+    // On CUDA versions previous to 8.0, only __shared__ variables
+    // could be declared as static in the device code.
+    int64_t head = 0;
 #else
-                static int64_t head = 0;
+    static int64_t head = 0;
 #endif
-                return head;
+    return head;
 #endif
-            }
-
-        } // namespace
-        // Static access to constant Dim
-        template <int i, int l> HOSTDEVICE int64_t get(const Dim<l> &d) {
-            return DimGetter<i>::impl(d);
-        }
-
-        // Static access to mutable Dim
-        template <int i, int l> HOSTDEVICE int64_t &get(Dim<l> &d) {
-            return DimGetter<i>::impl(d);
-        }
-
-        // Dynamic access to constant Dim
-        template <int l> HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
-            //  std::cout << "l: " << l << std::endl;
-            return indexer(*this, i);
-        }
-
-        // Dynamic access to mutable Dim
-        template <int l> HOSTDEVICE int64_t &Dim<l>::operator[](int i) {
-            return indexer(*this, i);
-        }
-
-        // Dynamic access to constant Dim
-        inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
-            return indexer(*this, i);
-        }
-
-        // Dynamic access to mutable Dim
-        inline HOSTDEVICE int64_t &Dim<0>::operator[](int i) {
-            return indexer(*this, i);
-        }
-
-        // Dynamic access to constant Dim
-        // without std::enable_if will try to instantiate this on get<0>(d)
-        template <int l>
-        HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type
-        get(const Dim<l> &d, int i) {
-            return d[i];
-        }
-
-        // Dynamic access to mutable Dim
-        template <int l>
-        HOSTDEVICE typename std::enable_if<(l > 0), int64_t &>::type
-        get(Dim<l> &d, int i) {
-            return d[i];
-        }
-
-        // Dot product of two dims
-        template <int i>
-        HOSTDEVICE int64_t linearize(const Dim<i> &a, const Dim<i> &b) {
-            return a.head * b.head + linearize(a.tail, b.tail);
-        }
-
-        // Base case dot product of two Dims
-        // Notice it is inline because it is no longer a template
-        template <>
-        HOSTDEVICE inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) {
-            return 0;
-        }
-
-        // Product of a Dim
-        template <int i>
-        HOSTDEVICE int64_t product(const Dim<i> &a, int prod = 1) {
-            return prod * a.head * product(a.tail);
-        }
-
-        // Base case product of a Dim
-        // Notice it is inline because it is no longer a template
-        template <>
-        HOSTDEVICE inline int64_t product(const Dim<0> &a, int prod) {
-            return prod;
-        }
-
-        // Is 0 <= idx_i < size_i for all i?
-        template <int i>
-        HOSTDEVICE bool contained(const Dim<i> &idx, const Dim<i> &size) {
-            return ((0 <= idx.head) && (idx.head < size.head) &&
-                    contained(idx.tail, size.tail));
-        }
-
-        // Base case of is 0 <= idx_i < size_i ?
-        // Notice it is inline because it is no longer a template
-        template <>
-        HOSTDEVICE inline bool contained(const Dim<0> &idx,
-                                         const Dim<0> &size) {
-            return true;
-        }
-
-        /**
-         * \brief Compute exclusive prefix-multiply of a Dim.
-         */
-        template <int i>
-        HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i> &src, int mul = 1) {
-            return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
-        }
-
-        ///\cond HIDDEN
-        // Base case of ex_prefix_mul
-        // Notice it is inline because it is no longer a template
-        template <>
-        HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) {
-            return Dim<0>();
-        }
-        ///\endcond
-
-        /**
-         * Add two dimensions together
-         */
-        template <int i>
-        HOSTDEVICE Dim<i> dim_plus(const Dim<i> &a, const Dim<i> &b) {
-            return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail));
-        }
-
-        // Base case
-        template <>
-        HOSTDEVICE inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) {
-            return Dim<0>();
-        }
-
-        template <int i>
-        HOSTDEVICE Dim<i> operator+(const Dim<i> &lhs, const Dim<i> &rhs) {
-            return dim_plus(lhs, rhs);
-        }
-
-        /**
-         * Multiply two dimensions together
-         */
-        template <int i>
-        HOSTDEVICE Dim<i> dim_mult(const Dim<i> &a, const Dim<i> &b) {
-            return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail));
-        }
-
-        // Base case
-        template <>
-        HOSTDEVICE inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) {
-            return Dim<0>();
-        }
-
-        template <int i>
-        HOSTDEVICE Dim<i> operator*(const Dim<i> &lhs, const Dim<i> &rhs) {
-            return dim_mult(lhs, rhs);
-        }
-
-        /**
-         * \brief Normalize strides to ensure any dimension with extent 1
-         * has stride 0.
-         *
-         * \param size Dim object containing the size of an array
-         * \param stride Dim object containing stride of an array
-         * \return Dim object the same size as \p size with normalized strides
-         *
-         */
-
-        template <int i>
-        HOSTDEVICE Dim<i> normalize_strides(const Dim<i> &size,
-                                            const Dim<i> &stride) {
-            int norm_stride = size.head == 1 ? 0 : stride.head;
-            return Dim<i>(norm_stride,
-                          normalize_strides(size.tail, stride.tail));
-        }
-
-        ///\cond HIDDEN
-
-        template <>
-        HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size,
-                                                   const Dim<0> &stride) {
-            return Dim<0>();
-        }
-
-        ///\endcond
-
-        /**
-         * Helper function to create a Dim
-         *
-         * \param idxes The type of Dim constructed depends on the number of
-         * params
-         *
-         */
-
-        template <typename... Args>
-        HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
-            return Dim<sizeof...(Args)>(idxes...);
-        }
-
-        // Allows us to output a Dim
-        // XXX For some reason, overloading fails to resolve this correctly
-        template <int i>
-        typename std::enable_if<(i > 1), std::ostream &>::type
-        operator<<(std::ostream &os, const Dim<i> &d) {
-            os << d.head << ", " << d.tail;
-            return os;
-        }
-
-        // Base case that allows us to output a Dim
-        // XXX I wish this could be an overload instead of a template
-        template <int i>
-        typename std::enable_if<(i == 1), std::ostream &>::type
-        operator<<(std::ostream &os, const Dim<i> &d) {
-            os << d.head;
-            return os;
-        }
-
-        inline std::ostream &operator<<(std::ostream &os, const Dim<0> &d) {
-            return os;
-        }
-
-        template <int i> HOST std::string Dim<i>::to_string() const {
-            std::stringstream stream;
-
-            stream << *this;
-
-            return stream.str();
-        }
-
-        template <int D>
-        HOSTDEVICE Dim<D> linear_to_dimension(int linear_index,
-                                              Dim<D> extents) {
-            Dim<D> result;
-
-            for (int i = 0; i < D - 1; ++i) {
-                result[i] = linear_index % extents[i];
-                linear_index /= extents[i];
-            }
-
-            result[D - 1] = linear_index;
-
-            return result;
-        }
-
-    } // namespace framework
+}
+
+} // namespace
+// Static access to constant Dim
+template <int i, int l> HOSTDEVICE int64_t get(const Dim<l> &d) {
+    return DimGetter<i>::impl(d);
+}
+
+// Static access to mutable Dim
+template <int i, int l> HOSTDEVICE int64_t &get(Dim<l> &d) {
+    return DimGetter<i>::impl(d);
+}
+
+// Dynamic access to constant Dim
+template <int l> HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
+    //  std::cout << "l: " << l << std::endl;
+    return indexer(*this, i);
+}
+
+// Dynamic access to mutable Dim
+template <int l> HOSTDEVICE int64_t &Dim<l>::operator[](int i) {
+    return indexer(*this, i);
+}
+
+// Dynamic access to constant Dim
+inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
+    return indexer(*this, i);
+}
+
+// Dynamic access to mutable Dim
+inline HOSTDEVICE int64_t &Dim<0>::operator[](int i) {
+    return indexer(*this, i);
+}
+
+// Dynamic access to constant Dim
+// without std::enable_if will try to instantiate this on get<0>(d)
+template <int l>
+HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l> &d,
+                                                               int i) {
+    return d[i];
+}
+
+// Dynamic access to mutable Dim
+template <int l>
+HOSTDEVICE typename std::enable_if<(l > 0), int64_t &>::type get(Dim<l> &d,
+                                                                 int i) {
+    return d[i];
+}
+
+// Dot product of two dims
+template <int i>
+HOSTDEVICE int64_t linearize(const Dim<i> &a, const Dim<i> &b) {
+    return a.head * b.head + linearize(a.tail, b.tail);
+}
+
+// Base case dot product of two Dims
+// Notice it is inline because it is no longer a template
+template <>
+HOSTDEVICE inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) {
+    return 0;
+}
+
+// Product of a Dim
+template <int i> HOSTDEVICE int64_t product(const Dim<i> &a, int prod = 1) {
+    return prod * a.head * product(a.tail);
+}
+
+// Base case product of a Dim
+// Notice it is inline because it is no longer a template
+template <> HOSTDEVICE inline int64_t product(const Dim<0> &a, int prod) {
+    return prod;
+}
+
+// Is 0 <= idx_i < size_i for all i?
+template <int i>
+HOSTDEVICE bool contained(const Dim<i> &idx, const Dim<i> &size) {
+    return ((0 <= idx.head) && (idx.head < size.head) &&
+            contained(idx.tail, size.tail));
+}
+
+// Base case of the "is 0 <= idx_i < size_i?" check
+// Notice it is inline because it is no longer a template
+template <>
+HOSTDEVICE inline bool contained(const Dim<0> &idx, const Dim<0> &size) {
+    return true;
+}
+
+/**
+ * \brief Compute exclusive prefix-multiply of a Dim.
+ */
+template <int i>
+HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i> &src, int mul = 1) {
+    return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
+}
+
+///\cond HIDDEN
+// Base case of ex_prefix_mul
+// Notice it is inline because it is no longer a template
+template <> HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) {
+    return Dim<0>();
+}
+///\endcond
+
+/**
+ * Add two dimensions together
+ */
+template <int i> HOSTDEVICE Dim<i> dim_plus(const Dim<i> &a, const Dim<i> &b) {
+    return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail));
+}
+
+// Base case
+template <>
+HOSTDEVICE inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) {
+    return Dim<0>();
+}
+
+template <int i>
+HOSTDEVICE Dim<i> operator+(const Dim<i> &lhs, const Dim<i> &rhs) {
+    return dim_plus(lhs, rhs);
+}
+
+/**
+ * Multiply two dimensions together
+ */
+template <int i> HOSTDEVICE Dim<i> dim_mult(const Dim<i> &a, const Dim<i> &b) {
+    return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail));
+}
+
+// Base case
+template <>
+HOSTDEVICE inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) {
+    return Dim<0>();
+}
+
+template <int i>
+HOSTDEVICE Dim<i> operator*(const Dim<i> &lhs, const Dim<i> &rhs) {
+    return dim_mult(lhs, rhs);
+}
+
+/**
+ * \brief Normalize strides to ensure any dimension with extent 1
+ * has stride 0.
+ *
+ * \param size Dim object containing the size of an array
+ * \param stride Dim object containing stride of an array
+ * \return Dim object the same size as \p size with normalized strides
+ *
+ */
+
+template <int i>
+HOSTDEVICE Dim<i> normalize_strides(const Dim<i> &size, const Dim<i> &stride) {
+    int norm_stride = size.head == 1 ? 0 : stride.head;
+    return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail));
+}
+
+///\cond HIDDEN
+
+template <>
+HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size,
+                                           const Dim<0> &stride) {
+    return Dim<0>();
+}
+
+///\endcond
+
+/**
+ * Helper function to create a Dim
+ *
+ * \param idxes The type of Dim constructed depends on the number of
+ * params
+ *
+ */
+
+template <typename... Args>
+HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
+    return Dim<sizeof...(Args)>(idxes...);
+}
+
+// Allows us to output a Dim
+// XXX For some reason, overloading fails to resolve this correctly
+template <int i>
+typename std::enable_if<(i > 1), std::ostream &>::type
+operator<<(std::ostream &os, const Dim<i> &d) {
+    os << d.head << ", " << d.tail;
+    return os;
+}
+
+// Base case that allows us to output a Dim
+// XXX I wish this could be an overload instead of a template
+template <int i>
+typename std::enable_if<(i == 1), std::ostream &>::type
+operator<<(std::ostream &os, const Dim<i> &d) {
+    os << d.head;
+    return os;
+}
+
+inline std::ostream &operator<<(std::ostream &os, const Dim<0> &d) {
+    return os;
+}
+
+template <int i> HOST std::string Dim<i>::to_string() const {
+    std::stringstream stream;
+
+    stream << *this;
+
+    return stream.str();
+}
+
+template <int D>
+HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
+    Dim<D> result;
+
+    for (int i = 0; i < D - 1; ++i) {
+        result[i] = linear_index % extents[i];
+        linear_index /= extents[i];
+    }
+
+    result[D - 1] = linear_index;
+
+    return result;
+}
+
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -23,75 +23,72 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        template <typename Dtype>
-        Executor<Dtype>::Executor(const Program<Dtype> p) : program_(p) {
-            if (use_optimize_) {
-                to_predict_program_ = program_.optimizeProgram;
-            } else {
-                to_predict_program_ = program_.originProgram;
-            }
+template <typename Dtype>
+Executor<Dtype>::Executor(const Program<Dtype> p) : program_(p) {
+    if (use_optimize_) {
+        to_predict_program_ = program_.optimizeProgram;
+    } else {
+        to_predict_program_ = program_.originProgram;
+    }

-            const std::vector<std::shared_ptr<BlockDesc>> blocks =
-                to_predict_program_->Blocks();
-            for (int i = 0; i < blocks.size(); ++i) {
-                std::shared_ptr<BlockDesc> block_desc = blocks[i];
-                std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-                for (int j = 0; j < ops.size(); ++j) {
-                    std::shared_ptr<OpDesc> op = ops[j];
-                    if (op->Type() == "conv2d" &&
-                        op->Input("Input")[0] == "pixel") {
-                        Attribute strides_attr = op->GetAttrMap().at("strides");
-                        std::vector<int> stride =
-                            strides_attr.Get<std::vector<int>>();
-                        for (int k = 0; k < stride.size(); ++k) {
-                        }
-                        std::shared_ptr<operators::ConvOp<Dtype, float>> conv =
-                            std::make_shared<operators::ConvOp<Dtype, float>>(
-                                op->Type(), op->GetInputs(), op->GetOutputs(),
-                                op->GetAttrMap(), program_.scope);
-                        ops_of_block_[*block_desc.get()].push_back(conv);
-                    }
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    for (int i = 0; i < blocks.size(); ++i) {
+        std::shared_ptr<BlockDesc> block_desc = blocks[i];
+        std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+        for (int j = 0; j < ops.size(); ++j) {
+            std::shared_ptr<OpDesc> op = ops[j];
+            if (op->Type() == "conv2d" && op->Input("Input")[0] == "pixel") {
+                Attribute strides_attr = op->GetAttrMap().at("strides");
+                std::vector<int> stride = strides_attr.Get<std::vector<int>>();
+                for (int k = 0; k < stride.size(); ++k) {
                }
+                std::shared_ptr<operators::ConvOp<Dtype, float>> conv =
+                    std::make_shared<operators::ConvOp<Dtype, float>>(
+                        op->Type(), op->GetInputs(), op->GetOutputs(),
+                        op->GetAttrMap(), program_.scope);
+                ops_of_block_[*block_desc.get()].push_back(conv);
            }
        }
+    }
+}

-        template <typename Dtype>
-        std::shared_ptr<Tensor> Executor<Dtype>::predict(Tensor &t) {
-            // feed
-            auto scope = program_.scope;
-            Variable *g_feed_value = scope->Var("pixel");
-            auto tensor = g_feed_value->GetMutable<Tensor>();
-            tensor->ShareDataWith(t);
+template <typename Dtype>
+std::shared_ptr<Tensor> Executor<Dtype>::predict(Tensor &t) {
+    // feed
+    auto scope = program_.scope;
+    Variable *g_feed_value = scope->Var("pixel");
+    auto tensor = g_feed_value->GetMutable<Tensor>();
+    tensor->ShareDataWith(t);

-            Variable *con_output = scope->Var("conv2d_0.tmp_0");
-            Tensor *output_tensor = con_output->GetMutable<Tensor>();
-            output_tensor->mutable_data<float>({1, 16, 32, 32});
-            //  std::cout << typeid(output_tensor).name() << std::endl;
-            //  std::cout << "output_tensor dims: " << output_tensor->dims() <<
-            //  std::endl;
+    Variable *con_output = scope->Var("conv2d_0.tmp_0");
+    Tensor *output_tensor = con_output->GetMutable<Tensor>();
+    output_tensor->mutable_data<float>({1, 16, 32, 32});
+    //  std::cout << typeid(output_tensor).name() << std::endl;
+    //  std::cout << "output_tensor dims: " << output_tensor->dims() <<
+    //  std::endl;

-            std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
-            out_tensor.reset(output_tensor);
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);

-            predict(t, 0);
-            return out_tensor;
-        }
+    predict(t, 0);
+    return out_tensor;
+}

-        template <typename Dtype>
-        void Executor<Dtype>::predict(const Tensor &t, int block_id) {
-            std::shared_ptr<BlockDesc> to_predict_block =
-                to_predict_program_->Block(block_id);
-            for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size();
-                 ++j) {
-                auto op = ops_of_block_[*to_predict_block.get()][j];
-                //    std::cout << "开始run" << std::endl;
-                op->Run();
-            }
-        }
+template <typename Dtype>
+void Executor<Dtype>::predict(const Tensor &t, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+        auto op = ops_of_block_[*to_predict_block.get()][j];
+        //    std::cout << "开始run" << std::endl;
+        op->Run();
+    }
+}

-        template class Executor<CPU>;
+template class Executor<CPU>;

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -32,22 +32,22 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        template <typename Dtype> class Executor {
-          public:
-            Executor(const Program<Dtype> p);
-            std::shared_ptr<Tensor> predict(Tensor &t);
+template <typename Dtype> class Executor {
+  public:
+    Executor(const Program<Dtype> p);
+    std::shared_ptr<Tensor> predict(Tensor &t);

-          private:
-            const framework::Program<Dtype> program_;
-            std::shared_ptr<ProgramDesc> to_predict_program_;
-            void predict(const Tensor &t, int block_id);
-            std::map<framework::BlockDesc,
-                     std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-                ops_of_block_;
-            bool use_optimize_ = false;
-        };
+  private:
+    const framework::Program<Dtype> program_;
+    std::shared_ptr<ProgramDesc> to_predict_program_;
+    void predict(const Tensor &t, int block_id);
+    std::map<framework::BlockDesc,
+             std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+        ops_of_block_;
+    bool use_optimize_ = false;
+};

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/framework.pb.cpp
+++ b/src/framework/framework.pb.cpp
--- a/src/framework/framework.pb.h
+++ b/src/framework/framework.pb.h
--- a/src/framework/lod_tensor.cc
+++ b/src/framework/lod_tensor.cc
@@ -19,304 +19,295 @@ limitations under the License. */
 #include <string.h>

 namespace paddle_mobile {
-    namespace framework {
-
-        std::ostream &operator<<(std::ostream &os, const LoD &lod) {
-            os << "{";
-            for (auto &v : lod) {
-                os << "{";
-                bool is_first = true;
-                for (auto &i : v) {
-                    if (is_first) {
-                        os << i;
-                        is_first = false;
-                    } else {
-                        os << ", " << i;
-                    }
-                }
-                os << "}";
+namespace framework {
+
+// Writes `lod` to `os` as nested braces: every level is printed as
+// "{a, b, c}" (offsets separated by ", "), the levels follow each other
+// without any separator, and the whole list is wrapped in one more pair of
+// braces, e.g. "{{0, 2}{0, 3, 5}}". Returns `os`.
+std::ostream &operator<<(std::ostream &os, const LoD &lod) {
+    os << "{";
+    for (auto &v : lod) {
+        os << "{";
+        bool is_first = true;
+        for (auto &i : v) {
+            if (is_first) {
+                os << i;
+                is_first = false;
+            } else {
+                os << ", " << i;
             }
-            os << "}";
-
-            return os;
         }
-
-        std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
-            //  PADDLE_ENFORCE(t.type().hash_code() ==
-            //  typeid(float).hash_code());
-
-            //  if (!platform::is_cpu_place(t.place())) {
-            //    LoDTensor tt;
-            //    framework::TensorCopy(t, platform::CPUPlace(), &tt);
-            //    platform::DeviceContextPool &pool =
-            //    platform::DeviceContextPool::Instance(); auto &dev_ctx =
-            //    *pool.Get(t.place()); dev_ctx.Wait();
-            //
-            //    os << tt;
-            //    return os;
-            //  }
-
-            os << "dim: " << t.dims() << "\n";
-            os << "lod: " << t.lod() << "\n";
-
-            // only print first ten elements
-            int64_t size = t.numel() < 10 ? t.numel() : 10;
-            for (int64_t i = 0; i < size; ++i) {
-                os << t.data<float>()[i] << " ";
-            }
-
-            return os;
+        os << "}";
+    }
+    os << "}";
+
+    return os;
+}
+
+// Streams a short summary of a LoDTensor: a "dim: " line, a "lod: " line and
+// then at most the first ten elements, read through data<float>() and each
+// followed by a single space. Returns `os`.
+//
+// No element-type check and no device-to-host copy is performed here; the
+// original only carried those steps as commented-out code.
+std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
+    os << "dim: " << t.dims() << "\n";
+    os << "lod: " << t.lod() << "\n";
+
+    // Cap the preview at ten values.
+    int64_t shown = 10;
+    if (t.numel() < 10) {
+        shown = t.numel();
+    }
+
+    int64_t idx = 0;
+    while (idx < shown) {
+        os << t.data<float>()[idx] << " ";
+        ++idx;
+    }
+
+    return os;
+}
+
+// Formats `lod` through the LoD stream operator and returns the resulting
+// text.
+std::string LoDToString(const LoD &lod) {
+    std::ostringstream out;
+    out << lod;
+    return out.str();
+}
+
+// Extracts a sub-LoD from `in`, starting at `level`.
+//
+// The first returned level holds offsets elem_begin..elem_end (inclusive) of
+// in[level]. Every following level is cut to the index range spanned by the
+// first and last value of the level above it. Finally each returned level is
+// rebased so that its first offset is 0.
+//
+// The range checks are commented out, so the caller must guarantee that
+// `level` and `elem_end` are valid indices.
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
+                 size_t elem_end) {
+    //  PADDLE_ENFORCE_LT(level, in.size());
+    //  PADDLE_ENFORCE_LT(elem_end, in[level].size());
+
+    LoD res;
+    res.resize(in.size() - level);
+    // copy the first level
+    res[0].assign(in[level].begin() + elem_begin,
+                  in[level].begin() + elem_end + 1);
+    for (size_t lvl = 1; lvl < res.size(); lvl++) {
+        const auto &in_level = in[level + lvl];
+        const auto &above_level = res[lvl - 1];
+        auto &out_level = res[lvl];
+        out_level.assign(in_level.begin() + above_level.front(),
+                         in_level.begin() + above_level.back() + 1);
+    }
+    for (size_t lvl = 0; lvl < res.size(); lvl++) {
+        // to make the first offset equals 0, all the elements minus the
+        // first
+        // element
+        size_t front = res[lvl].front();
+        for (auto &ele : res[lvl]) {
+            ele -= front;
         }
-
-        std::string LoDToString(const LoD &lod) {
-            std::ostringstream stream;
-            stream << lod;
-            return stream.str();
+    }
+    return res;
+}
+
+// Returns a copy of `in` in which every level except the last has been
+// rewritten: walking from the second-to-last level up to level 0, each entry
+// is replaced by the entry of the (already rewritten) level below that it
+// indexes. An empty or single-level LoD is returned unchanged.
+LoD ToAbsOffset(const LoD &in) {
+    // the lowest level stores relative offsets
+    if (in.empty() || in.size() == 1)
+        return in;
+    LoD result = in;
+    // `level` is a signed int so that the loop can stop after level 0.
+    for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
+        for (size_t i = 0; i < in[level].size(); ++i) {
+            size_t index = in[level][i];
+            result[level][i] = result[level + 1][index];
         }
-
-        LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
-                         size_t elem_end) {
-            //  PADDLE_ENFORCE_LT(level, in.size());
-            //  PADDLE_ENFORCE_LT(elem_end, in[level].size());
-
-            LoD res;
-            res.resize(in.size() - level);
-            // copy the first level
-            res[0].assign(in[level].begin() + elem_begin,
-                          in[level].begin() + elem_end + 1);
-            for (size_t lvl = 1; lvl < res.size(); lvl++) {
-                const auto &in_level = in[level + lvl];
-                const auto &above_level = res[lvl - 1];
-                auto &out_level = res[lvl];
-                out_level.assign(in_level.begin() + above_level.front(),
-                                 in_level.begin() + above_level.back() + 1);
-            }
-            for (size_t lvl = 0; lvl < res.size(); lvl++) {
-                // to make the first offset equals 0, all the elements minus the
-                // first
-                // element
-                size_t front = res[lvl].front();
-                for (auto &ele : res[lvl]) {
-                    ele -= front;
-                }
-            }
-            return res;
+    }
+    return result;
+}
+
+// Element-wise equality of two LoDs: true only when both have the same
+// number of levels, every pair of levels has the same length, and all
+// corresponding offsets are equal.
+bool operator==(const LoD &a, const LoD &b) {
+    if (a.size() != b.size()) {
+        return false;
+    }
+
+    for (size_t i = 0; i < a.size(); i++) {
+        const auto &a_level = a[i];
+        const auto &b_level = b[i];
+        if (a_level.size() != b_level.size()) {
+            return false;
         }
-
-        LoD ToAbsOffset(const LoD &in) {
-            // the lowest level stores relative offsets
-            if (in.empty() || in.size() == 1)
-                return in;
-            LoD result = in;
-            for (auto level = static_cast<int>(in.size() - 2); level >= 0;
-                 level--) {
-                for (size_t i = 0; i < in[level].size(); ++i) {
-                    size_t index = in[level][i];
-                    result[level][i] = result[level + 1][index];
-                }
-            }
-            return result;
-        }
-
-        bool operator==(const LoD &a, const LoD &b) {
-            if (a.size() != b.size()) {
+        for (size_t j = 0; j < a_level.size(); j++) {
+            if (a_level[j] != b_level[j]) {
                 return false;
             }
-
-            for (size_t i = 0; i < a.size(); i++) {
-                const auto &a_level = a[i];
-                const auto &b_level = b[i];
-                if (a_level.size() != b_level.size()) {
-                    return false;
-                }
-                for (size_t j = 0; j < a_level.size(); j++) {
-                    if (a_level[j] != b_level[j]) {
-                        return false;
-                    }
-                }
-            }
-            return true;
         }
-
-        bool CheckLoD(const LoD &in, int tensor_height) {
-            if (in.empty())
-                return true;
-            for (const auto &level : in) {
-                // check: there should be more than 2 offsets existing in each
-                // level.
-                if (level.size() < 2)
-                    return false;
-                // check: the first offset(the begin offset) of each level
-                // should be 0.
-                if (level.front() != 0)
-                    return false;
-                // check: all the offsets in a level should be ascending(no same
-                // items
-                // allows).
-                if (!std::is_sorted(level.begin(), level.begin(),
-                                    [](size_t a, size_t b) {
-                                        if (a < b)
-                                            return true;
-                                        return false;
-                                    })) {
-                    std::cout << "ascending error";
-                    return false;
-                }
-            }
-            // check: the lowest level's last offset should equals
-            // `tensor_height` if
-            //        tensor_height>0.
-            if (tensor_height > 0 && (size_t)tensor_height != in.back().back())
-                return false;
-
-            // check: the higher level's last offset should equals the lower
-            // level's
-            // size-1.
-            // NOTE LoD store the levels from top to bottom, so the higher level
-            // goes
-            // first.
-            for (size_t level = 0; level < in.size() - 1; level++) {
-                if (in[level].back() != in[level + 1].size() - 1)
-                    return false;
-            }
-            return true;
+    }
+    return true;
+}
+
+// Validates a relative-offset LoD. An empty LoD is valid. Otherwise every
+// level must hold at least two offsets, start at 0 and be sorted; the last
+// offset of the lowest level must equal `tensor_height` when that is
+// positive; and the last offset of each level must equal the size of the
+// next level minus one. Returns true when all checks pass.
+bool CheckLoD(const LoD &in, int tensor_height) {
+    if (in.empty())
+        return true;
+    for (const auto &level : in) {
+        // check: there should be at least 2 offsets in each level.
+        if (level.size() < 2)
+            return false;
+        // check: the first offset(the begin offset) of each level
+        // should be 0.
+        if (level.front() != 0)
+            return false;
+        // check: the offsets in a level must be sorted. The scan has to
+        // cover [begin, end); an empty [begin, begin) range is always
+        // "sorted" and would let any level through. Note that is_sorted
+        // with `<` still accepts equal neighbouring offsets.
+        if (!std::is_sorted(level.begin(), level.end(),
+                            [](size_t a, size_t b) { return a < b; })) {
+            std::cout << "ascending error";
+            return false;
         }
-
-        bool CheckAbsLoD(const LoD &in, int tensor_height) {
-            if (in.empty())
-                return true;
-            for (const auto &level : in) {
-                // check: all the offsets in a level should be ascending(no same
-                // items
-                // allows).
-                if (!std::is_sorted(level.begin(), level.begin(),
-                                    [](size_t a, size_t b) {
-                                        if (a < b)
-                                            return true;
-                                        return false;
-                                    })) {
-                    return false;
-                }
-
-                // check: there should be more than 2 offsets existing in each
-                // level.
-                if (level.size() < 2)
-                    return false;
-
-                // check: the first offset of each level should be 0, and the
-                // last should be
-                // the same(the height of underlying tensor).
-                if (level.front() != 0)
-                    return false;
-                if (tensor_height < 0) {
-                    tensor_height = level.back();
-                } else if ((size_t)tensor_height != level.back()) {
-                    return false;
-                }
-            }
-            return true;
+    }
+    // check: the lowest level's last offset should equal `tensor_height`
+    // if tensor_height > 0.
+    if (tensor_height > 0 &&
+        static_cast<size_t>(tensor_height) != in.back().back())
+        return false;
+
+    // check: the higher level's last offset should equal the lower level's
+    // size - 1.
+    // NOTE LoD stores the levels from top to bottom, so the higher level
+    // goes first. `in` is non-empty here, so in.size() - 1 cannot wrap.
+    for (size_t level = 0; level < in.size() - 1; level++) {
+        if (in[level].back() != in[level + 1].size() - 1)
+            return false;
+    }
+    return true;
+}
+
+// Validates an absolute-offset LoD. An empty LoD is valid. Otherwise every
+// level must be sorted, hold at least two offsets, start at 0 and end at the
+// same value. That value is `tensor_height` when it is non-negative on
+// entry, otherwise the last offset of the first level inspected.
+bool CheckAbsLoD(const LoD &in, int tensor_height) {
+    if (in.empty())
+        return true;
+    for (const auto &level : in) {
+        // check: the offsets in a level must be sorted. The scan has to
+        // cover [begin, end); an empty [begin, begin) range is always
+        // "sorted" and would let any level through. Note that is_sorted
+        // with `<` still accepts equal neighbouring offsets.
+        if (!std::is_sorted(level.begin(), level.end(),
+                            [](size_t a, size_t b) { return a < b; })) {
+            return false;
         }

-        using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
-
-        LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod,
-                                                size_t start_idx,
-                                                size_t end_idx,
-                                                size_t start_level) {
-            LoD sub_lod;
-
-            for (size_t level_idx = start_level; level_idx < lod.size();
-                 ++level_idx) {
-                //    PADDLE_ENFORCE_LE(start_idx, end_idx);
-                //    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
-                std::vector<size_t> level_lens;
-                for (size_t i = start_idx; i < end_idx; ++i) {
-                    level_lens.push_back(lod[level_idx][i + 1] -
-                                         lod[level_idx][i]);
-                }
-                sub_lod.emplace_back(level_lens);
-                start_idx = lod[level_idx][start_idx];
-                end_idx = lod[level_idx][end_idx];
-            }
-
-            return LoDAndOffset{sub_lod, {start_idx, end_idx}};
+        // check: there should be at least 2 offsets in each level.
+        if (level.size() < 2)
+            return false;
+
+        // check: the first offset of each level should be 0, and the
+        // last should be the same for all levels (the height of the
+        // underlying tensor).
+        if (level.front() != 0)
+            return false;
+        if (tensor_height < 0) {
+            // No height given: adopt the first level's last offset.
+            tensor_height = level.back();
+        } else if (static_cast<size_t>(tensor_height) != level.back()) {
+            return false;
         }
-
-        void AppendLoD(LoD *lod, const LoD &lod_length) {
-            //  PADDLE_ENFORCE(
-            //      lod->empty() || lod->size() == lod_length.size(),
-            //      "The lod_length should has the same size with the appended
-            //      lod.");
-            if (lod->empty()) {
-                for (size_t i = 0; i < lod_length.size(); ++i) {
-                    lod->emplace_back(1, 0); // size = 1, value = 0;
-                }
-                *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
-            }
-            for (size_t i = 0; i < lod->size(); ++i) {
-                auto &level = (*lod)[i];
-                for (size_t len : lod_length[i]) {
-                    level.push_back(level.back() + len);
-                }
-            }
+    }
+    return true;
+}
+
+using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
+
+// Walks `lod` from `start_level` to the last level. At each level it records
+// the lengths (differences of consecutive offsets) of entries
+// [start_idx, end_idx) and then maps start_idx / end_idx through that
+// level's offsets, which gives the index range used for the next level.
+// Returns the collected per-level lengths together with the final
+// (start_idx, end_idx) pair.
+//
+// The range checks are commented out, so the caller must guarantee that
+// start_idx <= end_idx and that end_idx is a valid index at every level.
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
+                                        size_t end_idx, size_t start_level) {
+    LoD sub_lod;
+
+    for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
+        //    PADDLE_ENFORCE_LE(start_idx, end_idx);
+        //    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
+        std::vector<size_t> level_lens;
+        for (size_t i = start_idx; i < end_idx; ++i) {
+            level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
         }
-
-        void SerializeToStream(std::ostream &os, const LoDTensor &tensor) {
-            { // the 1st field, uint32_t version for LoDTensor
-                constexpr uint32_t version = 0;
-                os.write(reinterpret_cast<const char *>(&version),
-                         sizeof(version));
-            }
-            {
-                // the 2st field, LoD information
-                // uint64_t lod_level
-                // uint64_t lod_level_1 size in byte.
-                // int*     lod_level_1 data
-                // ...
-                auto lod = tensor.lod();
-                uint64_t size = lod.size();
-                os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-
-                for (auto &each : lod) {
-                    size = each.size() *
-                           sizeof(framework::LoD::value_type::value_type);
-                    os.write(reinterpret_cast<const char *>(&size),
-                             sizeof(size));
-                    os.write(reinterpret_cast<const char *>(each.data()),
-                             static_cast<std::streamsize>(size));
-                }
-            }
-            // the 3st field, Tensor
-            TensorToStream(os, static_cast<Tensor>(tensor));
+        sub_lod.emplace_back(level_lens);
+        // Translate the index range into the next (lower) level.
+        start_idx = lod[level_idx][start_idx];
+        end_idx = lod[level_idx][end_idx];
+    }
+
+    return LoDAndOffset{sub_lod, {start_idx, end_idx}};
+}
+
+// Appends sequences described by their lengths to `*lod`.
+//
+// If `*lod` is empty it is first set to lod_length.size() levels, each
+// holding the single offset 0. Then, for every level i of `*lod`, each
+// length in lod_length[i] is turned into a new offset (previous last offset
+// plus the length) and pushed onto that level.
+//
+// The size check is commented out, so the caller must make sure lod_length
+// has at least as many levels as `*lod`.
+void AppendLoD(LoD *lod, const LoD &lod_length) {
+    //  PADDLE_ENFORCE(
+    //      lod->empty() || lod->size() == lod_length.size(),
+    //      "The lod_length should has the same size with the appended
+    //      lod.");
+    if (lod->empty()) {
+        // NOTE(review): this loop's result is overwritten by the assignment
+        // right after it, which builds the same content.
+        for (size_t i = 0; i < lod_length.size(); ++i) {
+            lod->emplace_back(1, 0); // size = 1, value = 0;
         }
-
-        void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
-            {
-                // the 1st field, unit32_t version for LoDTensor
-                uint32_t version;
-                is.read(reinterpret_cast<char *>(&version), sizeof(version));
-                //    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is
-                //    supported");
-            }
-            {
-                // the 2st field, LoD information
-                uint64_t lod_level;
-                is.read(reinterpret_cast<char *>(&lod_level),
-                        sizeof(lod_level));
-                auto &lod = *tensor->mutable_lod();
-                lod.resize(lod_level);
-                for (uint64_t i = 0; i < lod_level; ++i) {
-                    uint64_t size;
-                    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-                    std::vector<size_t> tmp(size / sizeof(size_t));
-                    is.read(reinterpret_cast<char *>(tmp.data()),
-                            static_cast<std::streamsize>(size));
-                    lod[i] = tmp;
-                }
-            }
-            // the 3st filed, Tensor
-            TensorFromStream(is, static_cast<Tensor *>(tensor));
+        *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
+    }
+    for (size_t i = 0; i < lod->size(); ++i) {
+        auto &level = (*lod)[i];
+        for (size_t len : lod_length[i]) {
+            level.push_back(level.back() + len);
+        }
+    }
+}
+
+// Writes `tensor` to `os` as three consecutive fields: a uint32_t version
+// (always 0), the LoD, and the tensor payload produced by TensorToStream.
+//
+// NOTE: offsets are written as raw size_t values, so the byte layout depends
+// on sizeof(size_t) of the writing platform.
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor) {
+    { // the 1st field, uint32_t version for LoDTensor
+        constexpr uint32_t version = 0;
+        os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+    }
+    {
+        // the 2nd field, LoD information
+        // uint64_t  number of levels
+        // uint64_t  level 1 size in bytes
+        // size_t[]  level 1 offsets
+        // ...
+        // lod() hands out a const reference; bind to it rather than
+        // deep-copying the whole LoD.
+        const auto &lod = tensor.lod();
+        uint64_t size = lod.size();
+        os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+        for (const auto &each : lod) {
+            size = each.size() * sizeof(framework::LoD::value_type::value_type);
+            os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+            os.write(reinterpret_cast<const char *>(each.data()),
+                     static_cast<std::streamsize>(size));
+        }
+    }
+    // the 3rd field, Tensor
+    TensorToStream(os, static_cast<Tensor>(tensor));
+}
+
+// Reads into `*tensor` the three fields laid out as a uint32_t version, the
+// LoD, and the tensor payload (via TensorFromStream).
+//
+// The version is read but not validated (the check is commented out), and
+// the stream state is not inspected after any read.
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+    {
+        // the 1st field, uint32_t version for LoDTensor
+        uint32_t version;
+        is.read(reinterpret_cast<char *>(&version), sizeof(version));
+        //    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is
+        //    supported");
+    }
+    {
+        // the 2nd field, LoD information
+        uint64_t lod_level;
+        is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+        auto &lod = *tensor->mutable_lod();
+        lod.resize(lod_level);
+        for (uint64_t i = 0; i < lod_level; ++i) {
+            // `size` is the level's byte count; the element count is
+            // derived with the local sizeof(size_t).
+            uint64_t size;
+            is.read(reinterpret_cast<char *>(&size), sizeof(size));
+            std::vector<size_t> tmp(size / sizeof(size_t));
+            is.read(reinterpret_cast<char *>(tmp.data()),
+                    static_cast<std::streamsize>(size));
+            lod[i] = tmp;
         }
+    }
+    // the 3rd field, Tensor
+    TensorFromStream(is, static_cast<Tensor *>(tensor));
+}

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/lod_tensor.h
+++ b/src/framework/lod_tensor.h
@@ -23,190 +23,186 @@ limitations under the License. */

 namespace paddle_mobile {

-    namespace framework {
-
-        /*
-         * LoD is short for Level of Details.
-         *
-         * - in a level, each element indicates relative offset of the lower
-         * level
-         * - the first element should be 0 and that indicates that this sequence
-         * start
-         * from 0
-         * - each sequence's begin and end(no-inclusive) is level[id, id+1]
-         *
-         * For example:
-         *    3-level LoD stores
-         *
-         *    0 2 3
-         *    0 2 4 7
-         *    0 2 5 7 10 12 15 20
-         */
-        using LoD = std::vector<std::vector<size_t>>;
-
-        std::ostream &operator<<(std::ostream &os, const LoD &lod);
-
-        std::ostream &operator<<(std::ostream &os, const LoDTensor &t);
-
-        std::string LoDToString(const LoD &lod);
-
-        LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
-                         size_t elem_end);
-
-        /*
-         * Transform an LoD from relative offsets to absolute offsets.
-         */
-        LoD ToAbsOffset(const LoD &in);
-
-        bool operator==(const LoD &a, const LoD &b);
-
-        /*
-         * Check whether this lod's format is valid.
-         *
-         * ATTENTION:
-         *   - Empty lod is treated as valid.
-         *
-         * It will check two things:
-         *
-         *  1. all the offsets in a level should be ascending(no same items
-         * allows).
-         *  2. there should be more than 2 offsets existing in each level.
-         *  3. the higher level's last offset should equals the lower level's
-         * size-1.
-         *  4. the first offset(the begin offset) of each level should be 0.
-         *  5. the lowest level's last offset should equals `tensor_height` if
-         * tensor_height>0.
-         */
-
-        bool CheckLoD(const LoD &in, int tensor_height = -1);
-
-        /*
-         * Check whether this absolute lod's format is valid.
-         *
-         * ATTENTION:
-         *   - Empty lod is treated as valid.
-         *
-         * It will check two things:
-         *  1. all the offsets in a level should be ascending(no same items
-         * allows)
-         *  2. there should be more than 2 offsets existing in each level.
-         *  3. the first offset of each level should be 0, and the last should
-         * be the
-         *     same(the height of underlying tensor) or `tensor_height` if
-         *     tensor_height>0.
-         */
-        bool CheckAbsLoD(const LoD &in, int tensor_height = -1);
-
-        /*
-         * LoDTensor (Level of details Tensor)
-         * see https://en.wikipedia.org/wiki/Level_of_details for reference.
-         */
-        class LoDTensor : public Tensor {
-          public:
-            LoDTensor() : Tensor() {}
-
-            explicit LoDTensor(const LoD &lod) : lod_(lod) {}
-
-            void set_lod(const LoD &lod) { lod_ = lod; }
-
-            const LoD &lod() const { return lod_; }
-
-            LoD *mutable_lod() { return &lod_; }
-
-            /*
-             * Get the start offset and end offset of an  element from LoD.
-             */
-            std::pair<size_t, size_t> lod_element(size_t level,
-                                                  size_t elem) const {
-                //    PADDLE_ENFORCE_LT(level, NumLevels());
-                //    PADDLE_ENFORCE_LT(elem, NumElements(level));
-                return std::make_pair((lod_)[level][elem],
-                                      (lod_)[level][elem + 1]);
-            }
-
-            /*
-             * Number of LoDTensor's levels, each level has units of data, for
-             * example,
-             * in the sentence's view, article, paragraph, sentence are 3
-             * levels.
-             */
-            size_t NumLevels() const { return lod_.size(); }
-
-            /*
-             * Number of elements in a level.
-             */
-            size_t NumElements(size_t level = 0) const {
-                //    PADDLE_ENFORCE_LT(level, NumLevels());
-                // the last offset is the end of last element
-                return (lod_)[level].size() - 1;
-            }
-
-          private:
-            LoD lod_;
-        };
-
-        /*
-         * Expand the `source` to fit the LoD of `lod`. For example, a `source`
-         * LoDTensor is
-         *  - LoD: [0, 2]
-         *  - tensor: [a0, a1]
-         * a `lod` is
-         *  - LoD: [0 3 5]
-         * returns a new LoDTensor
-         *  - [a0 a0 a0 a1 a1]
-         */
-        template <typename T>
-        LoDTensor LodExpand(const LoDTensor &source, const LoD &lod,
-                            size_t level) {
-            LoD abs_lod = ToAbsOffset(lod);
-            const auto &lod_level = lod[level];
-            size_t num_instances = source.dims()[0];
-
-            // new tensor
-            LoDTensor tensor;
-            tensor.set_lod(lod);
-            auto dims = source.dims();
-            dims[0] = lod_level.back();
-            tensor.Resize(dims);
-            tensor.mutable_data<T>();
-
-            //  PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
-            for (size_t ins = 0; ins < num_instances; ins++) {
-                for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1];
-                     elem++) {
-                    auto slice = tensor.Slice(elem, elem + 1);
-                    TensorCopy(source.Slice(ins, ins + 1), &slice);
-                }
-            }
-            return tensor;
+namespace framework {
+
+/*
+ * LoD is short for Level of Details.
+ *
+ * - in a level, each element indicates relative offset of the lower
+ * level
+ * - the first element should be 0 and that indicates that this sequence
+ * start
+ * from 0
+ * - each sequence's begin and end(no-inclusive) is level[id, id+1]
+ *
+ * For example:
+ *    3-level LoD stores
+ *
+ *    0 2 3
+ *    0 2 4 7
+ *    0 2 5 7 10 12 15 20
+ */
+using LoD = std::vector<std::vector<size_t>>;
+
+std::ostream &operator<<(std::ostream &os, const LoD &lod);
+
+std::ostream &operator<<(std::ostream &os, const LoDTensor &t);
+
+std::string LoDToString(const LoD &lod);
+
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
+                 size_t elem_end);
+
+/*
+ * Transform an LoD from relative offsets to absolute offsets.
+ */
+LoD ToAbsOffset(const LoD &in);
+
+bool operator==(const LoD &a, const LoD &b);
+
+/*
+ * Check whether this lod's format is valid.
+ *
+ * ATTENTION:
+ *   - Empty lod is treated as valid.
+ *
+ * It will check five things:
+ *
+ *  1. all the offsets in a level should be ascending(no same items
+ * allows).
+ *  2. there should be at least 2 offsets existing in each level.
+ *  3. the higher level's last offset should equals the lower level's
+ * size-1.
+ *  4. the first offset(the begin offset) of each level should be 0.
+ *  5. the lowest level's last offset should equals `tensor_height` if
+ * tensor_height>0.
+ */
+
+bool CheckLoD(const LoD &in, int tensor_height = -1);
+
+/*
+ * Check whether this absolute lod's format is valid.
+ *
+ * ATTENTION:
+ *   - Empty lod is treated as valid.
+ *
+ * It will check three things:
+ *  1. all the offsets in a level should be ascending(no same items
+ * allows)
+ *  2. there should be at least 2 offsets existing in each level.
+ *  3. the first offset of each level should be 0, and the last should
+ * be the
+ *     same(the height of underlying tensor) or `tensor_height` if
+ *     tensor_height>0.
+ */
+bool CheckAbsLoD(const LoD &in, int tensor_height = -1);
+
+/*
+ * LoDTensor (Level of details Tensor)
+ * see https://en.wikipedia.org/wiki/Level_of_details for reference.
+ */
+class LoDTensor : public Tensor {
+  public:
+    // Builds a tensor whose LoD is empty.
+    LoDTensor() : Tensor() {}
+
+    // Builds a tensor that stores a copy of `lod`.
+    explicit LoDTensor(const LoD &lod) : lod_(lod) {}
+
+    // Read-only view, mutable pointer, and setter for the stored LoD.
+    const LoD &lod() const { return lod_; }
+
+    LoD *mutable_lod() { return &lod_; }
+
+    void set_lod(const LoD &lod) { lod_ = lod; }
+
+    // Returns the {begin, end} offsets of element `elem` at `level`, i.e.
+    // the offsets stored at positions elem and elem + 1 of that level.
+    // Neither index is range-checked.
+    std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const {
+        const auto &offsets = lod_[level];
+        return std::make_pair(offsets[elem], offsets[elem + 1]);
+    }
+
+    // Number of levels in the LoD (for text, article / paragraph / sentence
+    // would be three levels).
+    size_t NumLevels() const { return lod_.size(); }
+
+    // Number of elements in `level`: one fewer than the number of offsets,
+    // because the last offset only marks the end of the last element.
+    // `level` is not range-checked.
+    size_t NumElements(size_t level = 0) const {
+        const auto &offsets = lod_[level];
+        return offsets.size() - 1;
+    }
+
+  private:
+    LoD lod_;
+};
+
+/*
+ * Expand the `source` to fit the LoD of `lod`. For example, a `source`
+ * LoDTensor is
+ *  - LoD: [0, 2]
+ *  - tensor: [a0, a1]
+ * a `lod` is
+ *  - LoD: [0 3 5]
+ * returns a new LoDTensor
+ *  - [a0 a0 a0 a1 a1]
+ */
+// Builds a new LoDTensor carrying `lod`, whose first dimension is the last
+// offset of lod[level] and whose remaining dimensions match `source`. Row
+// `ins` of `source` is copied into every row in
+// [lod[level][ins], lod[level][ins + 1]) of the result.
+//
+// The offsets of lod[level] are used as given (no absolute-offset
+// conversion is applied). The check that source.dims()[0] equals
+// lod[level].size() - 1 is commented out, so the caller must guarantee it.
+template <typename T>
+LoDTensor LodExpand(const LoDTensor &source, const LoD &lod, size_t level) {
+    const auto &lod_level = lod[level];
+    size_t num_instances = source.dims()[0];
+
+    // new tensor
+    LoDTensor tensor;
+    tensor.set_lod(lod);
+    auto dims = source.dims();
+    dims[0] = lod_level.back();
+    tensor.Resize(dims);
+    tensor.mutable_data<T>();
+
+    //  PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
+    for (size_t ins = 0; ins < num_instances; ins++) {
+        for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
+            auto slice = tensor.Slice(elem, elem + 1);
+            TensorCopy(source.Slice(ins, ins + 1), &slice);
         }
-
-        // Get the absolute offset of a lod[start_level][start_idx:end_idx] and
-        // relative length of details for every levels(i.e., [start_level: ]).
-        //
-        // For example,
-        //   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
-        //   start_level = 0
-        //   start_idx = 1
-        //   end_idx = 3
-        //
-        // Returns:
-        //  LoD = [[1, 4], [2, 4, 2, 3, 2]]
-        //  pair<size_t, size_t> = {11, 24}
-        std::pair<LoD, std::pair<size_t, size_t>>
-        GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
-                                   size_t end_idx, size_t start_level);
-
-        void AppendLoD(LoD *lod, const LoD &lod_length);
-
-        /*
-         * Serialize/Desiralize LoDTensor to std::ostream
-         * You can pass ofstream or ostringstream to serilize to file
-         * or to a in memory string. GPU tensor will be copied to CPU.
-         */
-        void SerializeToStream(std::ostream &os, const LoDTensor &tensor);
-
-        void DeserializeFromStream(std::istream &is, LoDTensor *tensor);
-
-    } // namespace framework
+    }
+    return tensor;
+}
+
+// Get the absolute offset of a lod[start_level][start_idx:end_idx] and
+// relative length of details for every levels(i.e., [start_level: ]).
+//
+// For example,
+//   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
+//   start_level = 0
+//   start_idx = 1
+//   end_idx = 3
+//
+// Returns:
+//  LoD = [[1, 4], [2, 4, 2, 3, 2]]
+//  pair<size_t, size_t> = {11, 24}
+std::pair<LoD, std::pair<size_t, size_t>>
+GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, size_t end_idx,
+                           size_t start_level);
+
+void AppendLoD(LoD *lod, const LoD &lod_length);
+
+/*
+ * Serialize/Deserialize a LoDTensor to/from a stream.
+ * You can pass an ofstream or ostringstream to serialize to a file
+ * or to an in-memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor);
+
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor);
+
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_desc.cpp
+++ b/src/framework/op_desc.cpp
@@ -5,58 +5,55 @@
 #include "op_desc.h"

 namespace paddle_mobile {
-    namespace framework {
-
-        OpDesc::OpDesc(const proto::OpDesc &desc) : desc_(desc) {
-            for (int i = 0; i < desc_.inputs_size(); ++i) {
-                const proto::OpDesc::Var &var = desc_.inputs(i);
-                std::vector<std::string> &args = inputs_[var.parameter()];
-                int arg_size = var.arguments_size();
-                for (int j = 0; j < arg_size; ++j) {
-                    args.push_back(var.arguments(j));
-                }
-            }
-
-            for (int i = 0; i < desc_.outputs_size(); ++i) {
-                const proto::OpDesc::Var &var = desc_.outputs(i);
-                std::vector<std::string> &args = outputs_[var.parameter()];
-                int arg_size = var.arguments_size();
-                for (int j = 0; j < arg_size; ++j) {
-                    args.push_back(var.arguments(j));
-                }
-            }
-
-            for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
-                std::string attr_name = attr.name();
-                if (attr.type() != proto::AttrType::BLOCK) {
-                    attrs_[attr_name] = Attribute::GetAttrValue(attr);
-                    //      if (attr.type() == proto::AttrType::INT){
-                    //        std::cout << " attrName " << attr_name << " " <<
-                    //        attrs_[attr_name].Get<int>() << std::endl;
-                    //      }
-                }
-            }
+namespace framework {
+
+OpDesc::OpDesc(const proto::OpDesc &desc) : desc_(desc) {
+    for (int i = 0; i < desc_.inputs_size(); ++i) {
+        const proto::OpDesc::Var &var = desc_.inputs(i);
+        std::vector<std::string> &args = inputs_[var.parameter()];
+        int arg_size = var.arguments_size();
+        for (int j = 0; j < arg_size; ++j) {
+            args.push_back(var.arguments(j));
        }
-
-        const std::vector<std::string> &
-        OpDesc::Input(const std::string &name) const {
-            return inputs_.find(name)->second;
+    }
+
+    for (int i = 0; i < desc_.outputs_size(); ++i) {
+        const proto::OpDesc::Var &var = desc_.outputs(i);
+        std::vector<std::string> &args = outputs_[var.parameter()];
+        int arg_size = var.arguments_size();
+        for (int j = 0; j < arg_size; ++j) {
+            args.push_back(var.arguments(j));
        }
-
-        const std::vector<std::string> &
-        OpDesc::Output(const std::string &name) const {
-            return outputs_.find(name)->second;
+    }
+
+    for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
+        std::string attr_name = attr.name();
+        if (attr.type() != proto::AttrType::BLOCK) {
+            attrs_[attr_name] = Attribute::GetAttrValue(attr);
+            //      if (attr.type() == proto::AttrType::INT){
+            //        std::cout << " attrName " << attr_name << " " <<
+            //        attrs_[attr_name].Get<int>() << std::endl;
+            //      }
        }
+    }
+}

-        Attribute OpDesc::GetAttr(const std::string &name) const {
-            auto it = attrs_.find(name);
-            return it->second;
-        }
+const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
+    return inputs_.find(name)->second;
+}

-        const std::unordered_map<std::string, Attribute> &
-        OpDesc::GetAttrMap() const {
-            return attrs_;
-        }
+const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
+    return outputs_.find(name)->second;
+}
+
+Attribute OpDesc::GetAttr(const std::string &name) const {
+    auto it = attrs_.find(name);
+    return it->second;
+}
+
+const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
+    return attrs_;
+}

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_desc.h
+++ b/src/framework/op_desc.h
@@ -23,31 +23,29 @@ SOFTWARE.
 #include "paddle_mobile_object.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        class OpDesc : PaddleMobileObject {
-          public:
-            OpDesc(const proto::OpDesc &desc);
-            const std::vector<std::string> &
-            Input(const std::string &name) const;
-            const std::vector<std::string> &
-            Output(const std::string &name) const;
-            Attribute GetAttr(const std::string &name) const;
+class OpDesc : PaddleMobileObject {
+  public:
+    OpDesc(const proto::OpDesc &desc);
+    const std::vector<std::string> &Input(const std::string &name) const;
+    const std::vector<std::string> &Output(const std::string &name) const;
+    Attribute GetAttr(const std::string &name) const;

-            const VariableNameMap &GetInputs() { return inputs_; }
+    const VariableNameMap &GetInputs() { return inputs_; }

-            const VariableNameMap &GetOutputs() { return outputs_; }
+    const VariableNameMap &GetOutputs() { return outputs_; }

-            const AttributeMap &GetAttrMap() const;
+    const AttributeMap &GetAttrMap() const;

-            const std::string &Type() { return desc_.type(); };
+    const std::string &Type() { return desc_.type(); };

-          private:
-            proto::OpDesc desc_;
-            VariableNameMap inputs_;
-            VariableNameMap outputs_;
-            AttributeMap attrs_;
-        };
+  private:
+    proto::OpDesc desc_;
+    VariableNameMap inputs_;
+    VariableNameMap outputs_;
+    AttributeMap attrs_;
+};

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_info.h
+++ b/src/framework/op_info.h
@@ -22,74 +22,73 @@ SOFTWARE.
 #include "framework.pb.h"

 namespace paddle_mobile {
-    namespace framework {
-
-        template <typename Dtype> struct OpInfo {
-            OpCreator<Dtype> creator_;
-            const OpCreator<Dtype> &Creator() const {
-                //    PADDLE_ENFORCE_NOT_NULL(creator_,
-                //                            "Operator Creator has not been
-                //                            registered");
-                return creator_;
-            }
-        };
-
-        template <typename Dtype> class OpInfoMap;
-
-        template <typename Dtype>
-        static OpInfoMap<Dtype> *g_op_info_map = nullptr;
-
-        template <typename Dtype> class OpInfoMap {
-          public:
-            static OpInfoMap &Instance() {
-                if (g_op_info_map<Dtype> == nullptr) {
-                    g_op_info_map<Dtype> = new OpInfoMap();
-                }
-                return *g_op_info_map<Dtype>;
-            };
-
-            bool Has(const std::string &op_type) const {
-                return map_.find(op_type) != map_.end();
-            }
-
-            void Insert(const std::string &type, const OpInfo<Dtype> &info) {
-                //    PADDLE_ENFORCE(!Has(type), "Operator %s has been
-                //    registered", type);
-                map_.insert({type, info});
-            }
-
-            const OpInfo<Dtype> &Get(const std::string &type) const {
-                auto op_info_ptr = GetNullable(type);
-                //    PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not
-                //    been
-                //    registered",
-                //                            type);
-                return *op_info_ptr;
-            }
-
-            const OpInfo<Dtype> *GetNullable(const std::string &type) const {
-                auto it = map_.find(type);
-                if (it == map_.end()) {
-                    return nullptr;
-                } else {
-                    return &it->second;
-                }
-            }
-
-            const std::unordered_map<std::string, OpInfo<Dtype>> &map() const {
-                return map_;
-            }
-
-            std::unordered_map<std::string, OpInfo<Dtype>> *mutable_map() {
-                return &map_;
-            }
-
-          private:
-            OpInfoMap() = default;
-            std::unordered_map<std::string, OpInfo<Dtype>> map_;
-
-            //  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
-        };
-
-    } // namespace framework
+namespace framework {
+
+template <typename Dtype> struct OpInfo {
+    OpCreator<Dtype> creator_;
+    const OpCreator<Dtype> &Creator() const {
+        //    PADDLE_ENFORCE_NOT_NULL(creator_,
+        //                            "Operator Creator has not been
+        //                            registered");
+        return creator_;
+    }
+};
+
+template <typename Dtype> class OpInfoMap;
+
+template <typename Dtype> static OpInfoMap<Dtype> *g_op_info_map = nullptr;
+
+template <typename Dtype> class OpInfoMap {
+  public:
+    static OpInfoMap &Instance() {
+        if (g_op_info_map<Dtype> == nullptr) {
+            g_op_info_map<Dtype> = new OpInfoMap();
+        }
+        return *g_op_info_map<Dtype>;
+    };
+
+    bool Has(const std::string &op_type) const {
+        return map_.find(op_type) != map_.end();
+    }
+
+    void Insert(const std::string &type, const OpInfo<Dtype> &info) {
+        //    PADDLE_ENFORCE(!Has(type), "Operator %s has been
+        //    registered", type);
+        map_.insert({type, info});
+    }
+
+    const OpInfo<Dtype> &Get(const std::string &type) const {
+        auto op_info_ptr = GetNullable(type);
+        //    PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not
+        //    been
+        //    registered",
+        //                            type);
+        return *op_info_ptr;
+    }
+
+    const OpInfo<Dtype> *GetNullable(const std::string &type) const {
+        auto it = map_.find(type);
+        if (it == map_.end()) {
+            return nullptr;
+        } else {
+            return &it->second;
+        }
+    }
+
+    const std::unordered_map<std::string, OpInfo<Dtype>> &map() const {
+        return map_;
+    }
+
+    std::unordered_map<std::string, OpInfo<Dtype>> *mutable_map() {
+        return &map_;
+    }
+
+  private:
+    OpInfoMap() = default;
+    std::unordered_map<std::string, OpInfo<Dtype>> map_;
+
+    //  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
+};
+
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_kernel_type.h
+++ b/src/framework/op_kernel_type.h
@@ -22,51 +22,44 @@ SOFTWARE.
 #include "framework.pb.h"

 namespace paddle_mobile {
-    namespace framework {
-        struct OpKernelType {
-            struct Hash {
-                size_t operator()(const OpKernelType &key) const {
-                    int data_type = static_cast<int>(key.data_type_)
-                                    << LEFT_SHIFT;
-                    int data_layout = static_cast<int>(key.data_layout_)
-                                      << (LEFT_SHIFT * 2);
+namespace framework {
+struct OpKernelType {
+    struct Hash {
+        size_t operator()(const OpKernelType &key) const {
+            int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
+            int data_layout = static_cast<int>(key.data_layout_)
+                              << (LEFT_SHIFT * 2);

-                    std::hash<int> hasher;
-                    return hasher(data_type + data_layout);
-                }
-            };
+            std::hash<int> hasher;
+            return hasher(data_type + data_layout);
+        }
+    };

-            // place, data_type, library_type kinds less than 2^8
-            constexpr static int LEFT_SHIFT = 8;
+    // place, data_type and library_type each have fewer than 2^8 kinds
+    constexpr static int LEFT_SHIFT = 8;

-            proto::VarType::Type data_type_;
-            DataLayout data_layout_;
+    proto::VarType::Type data_type_;
+    DataLayout data_layout_;

-            OpKernelType(proto::VarType::Type data_type,
-                         DataLayout data_layout = DataLayout::kAnyLayout)
-                : data_type_(data_type), data_layout_(data_layout) {}
+    OpKernelType(proto::VarType::Type data_type,
+                 DataLayout data_layout = DataLayout::kAnyLayout)
+        : data_type_(data_type), data_layout_(data_layout) {}

-            bool operator==(const OpKernelType &o) const {
-                return data_type_ == o.data_type_ &&
-                       data_layout_ == o.data_layout_;
-            }
+    bool operator==(const OpKernelType &o) const {
+        return data_type_ == o.data_type_ && data_layout_ == o.data_layout_;
+    }

-            bool operator!=(const OpKernelType &o) const {
-                return !(*this == o);
-            }
-        };
+    bool operator!=(const OpKernelType &o) const { return !(*this == o); }
+};

-        inline bool NeedTransformLayout(const DataLayout &l,
-                                        const DataLayout &r) {
-            return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout &&
-                   l != r;
-        }
+inline bool NeedTransformLayout(const DataLayout &l, const DataLayout &r) {
+    return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+}

-        inline bool TransFromNeeded(const OpKernelType &l,
-                                    const OpKernelType &r) {
-            return (l.data_type_ != r.data_type_) ||
-                   NeedTransformLayout(l.data_layout_, r.data_layout_);
-        }
+inline bool TransFromNeeded(const OpKernelType &l, const OpKernelType &r) {
+    return (l.data_type_ != r.data_type_) ||
+           NeedTransformLayout(l.data_layout_, r.data_layout_);
+}

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/op_proto_maker.h
+++ b/src/framework/op_proto_maker.h
@@ -19,8 +19,8 @@ SOFTWARE.
 #pragma once

 namespace paddle_mobile {
-    namespace framework {
-        // this class not only make proto but also init attribute checkers.
-        class OpProtoAndCheckerMaker {};
-    } // namespace framework
+namespace framework {
+// This class not only makes the proto but also initializes attribute checkers.
+class OpProtoAndCheckerMaker {};
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -20,23 +20,23 @@ SOFTWARE.
 #include "op_info.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        template <typename Dtype>
-        OperatorBase<Dtype>::OperatorBase(const std::string &type,
-                                          const VariableNameMap &inputs,
-                                          const VariableNameMap &outputs,
-                                          const AttributeMap &attrs,
-                                          std::shared_ptr<Scope> scope)
-            : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs),
-              scope_(scope) {
-            CheckAllInputOutputSet();
-        }
-        template <typename Dtype>
-        void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
+template <typename Dtype>
+OperatorBase<Dtype>::OperatorBase(const std::string &type,
+                                  const VariableNameMap &inputs,
+                                  const VariableNameMap &outputs,
+                                  const AttributeMap &attrs,
+                                  std::shared_ptr<Scope> scope)
+    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs),
+      scope_(scope) {
+    CheckAllInputOutputSet();
+}
+template <typename Dtype>
+void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}

-        template class OperatorBase<CPU>;
-        template class OperatorWithKernel<CPU>;
+template class OperatorBase<CPU>;
+template class OperatorWithKernel<CPU>;

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -33,68 +33,64 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-    namespace framework {
-        static std::unordered_map<std::string, std::vector<std::string>>
-            op_input_output_key = {
-                {"conv2d", {"Input", "Output"}},   {"relu", {"X", "Out"}},
-                {"softmax", {"X", "Out"}},         {"mul", {"X", "Out"}},
-                {"elementwise_add", {"X", "Out"}}, {"pool2d", {"X", "Out"}},
-                {"batch_norm", {"X", "Y"}},        {"lrn", {"X", "Out"}},
-                {"concat", {"X", "Out"}},
+namespace framework {
+static std::unordered_map<std::string, std::vector<std::string>>
+    op_input_output_key = {
+        {"conv2d", {"Input", "Output"}},   {"relu", {"X", "Out"}},
+        {"softmax", {"X", "Out"}},         {"mul", {"X", "Out"}},
+        {"elementwise_add", {"X", "Out"}}, {"pool2d", {"X", "Out"}},
+        {"batch_norm", {"X", "Y"}},        {"lrn", {"X", "Out"}},
+        {"concat", {"X", "Out"}},

-        };
+};

-        template <typename Dtype> class OperatorBase : PaddleMobileObject {
-          public:
-            OperatorBase(const std::string &type, const VariableNameMap &inputs,
-                         const VariableNameMap &outputs,
-                         const AttributeMap &attrs,
-                         std::shared_ptr<Scope> scope);
-            virtual ~OperatorBase() {}
-            virtual void Run() const = 0;
+template <typename Dtype> class OperatorBase : PaddleMobileObject {
+  public:
+    OperatorBase(const std::string &type, const VariableNameMap &inputs,
+                 const VariableNameMap &outputs, const AttributeMap &attrs,
+                 std::shared_ptr<Scope> scope);
+    virtual ~OperatorBase() {}
+    virtual void Run() const = 0;

-            const VariableNameMap &Inputs() const { return inputs_; }
-            const VariableNameMap &Outputs() const { return outputs_; }
-            const std::string &Type() const { return type_; }
-            const AttributeMap &Attrs() const { return attrs_; }
-            void ClearVariables() const {
-                if (this->scope_) {
-                    this->scope_->EraseVars(this->inputs_.at("Filter"));
-                    this->scope_->EraseVars(this->inputs_.at("Input"));
-                }
-            }
+    const VariableNameMap &Inputs() const { return inputs_; }
+    const VariableNameMap &Outputs() const { return outputs_; }
+    const std::string &Type() const { return type_; }
+    const AttributeMap &Attrs() const { return attrs_; }
+    void ClearVariables() const {
+        if (this->scope_) {
+            this->scope_->EraseVars(this->inputs_.at("Filter"));
+            this->scope_->EraseVars(this->inputs_.at("Input"));
+        }
+    }

-          protected:
-            std::shared_ptr<Scope> scope_;
-            std::string type_;
-            VariableNameMap inputs_;
-            VariableNameMap outputs_;
-            AttributeMap attrs_;
+  protected:
+    std::shared_ptr<Scope> scope_;
+    std::string type_;
+    VariableNameMap inputs_;
+    VariableNameMap outputs_;
+    AttributeMap attrs_;

-          private:
-            void CheckAllInputOutputSet() const;
-        };
+  private:
+    void CheckAllInputOutputSet() const;
+};

-        template <typename Dtype>
-        class OperatorWithKernel : public OperatorBase<Dtype> {
-          public:
-            OperatorWithKernel(const std::string &type,
-                               const VariableNameMap &inputs,
-                               const VariableNameMap &outputs,
-                               const AttributeMap &attrs,
-                               std::shared_ptr<Scope> scope)
-                : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {}
-            virtual void InferShape() const = 0;
-            virtual void Run() const = 0;
-        };
+template <typename Dtype>
+class OperatorWithKernel : public OperatorBase<Dtype> {
+  public:
+    OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const AttributeMap &attrs, std::shared_ptr<Scope> scope)
+        : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {}
+    virtual void InferShape() const = 0;
+    virtual void Run() const = 0;
+};

-        template <typename Dtype, typename P>
-        class OpKernelBase : PaddleMobileObject {
-          public:
-            virtual void Compute(const P &para) const = 0;
+template <typename Dtype, typename P> class OpKernelBase : PaddleMobileObject {
+  public:
+    virtual void Compute(const P &para) const = 0;

-            virtual ~OpKernelBase() = default;
-        };
+    virtual ~OpKernelBase() = default;
+};

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/paddle_mobile_object.h
+++ b/src/framework/paddle_mobile_object.h
@@ -23,14 +23,14 @@ SOFTWARE.

 namespace paddle_mobile {

-    class PaddleMobileObject {
-      public:
-        virtual inline const std::string &ToString() {
-            char address[128] = {0};
-            sprintf(address, "%p", this);
-            return std::string(address);
-        }
+class PaddleMobileObject {
+  public:
+    virtual inline const std::string &ToString() {
+        char address[128] = {0};
+        sprintf(address, "%p", this);
+        return std::string(address);
+    }

-      private:
-    };
+  private:
+};
 } // namespace paddle_mobile
--- a/src/framework/program.cpp
+++ b/src/framework/program.cpp
@@ -17,5 +17,5 @@ SOFTWARE.
 ==============================================================================*/

 namespace paddle_mobile {
-    namespace framework {}
+namespace framework {}
 } // namespace paddle_mobile
--- a/src/framework/program.h
+++ b/src/framework/program.h
@@ -24,17 +24,17 @@ SOFTWARE.
 #include "scope.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        template <typename Dtype, Precision P = Precision::FP32>
-        class Program : PaddleMobileObject {
-          public:
-            std::shared_ptr<ProgramDesc> originProgram;
-            std::shared_ptr<ProgramDesc> optimizeProgram;
-            std::shared_ptr<Scope> scope;
+template <typename Dtype, Precision P = Precision::FP32>
+class Program : PaddleMobileObject {
+  public:
+    std::shared_ptr<ProgramDesc> originProgram;
+    std::shared_ptr<ProgramDesc> optimizeProgram;
+    std::shared_ptr<Scope> scope;

-          private:
-        };
+  private:
+};

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/program_desc.cpp
+++ b/src/framework/program_desc.cpp
@@ -5,18 +5,18 @@
 #include "program_desc.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) : desc_(desc) {
-            for (auto &block_desc : *desc_.mutable_blocks()) {
-                // new framework::BlockDesc(block_desc)
-                blocks_.emplace_back(std::make_shared<BlockDesc>(block_desc));
-            }
-        }
+ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) : desc_(desc) {
+    for (auto &block_desc : *desc_.mutable_blocks()) {
+        // new framework::BlockDesc(block_desc)
+        blocks_.emplace_back(std::make_shared<BlockDesc>(block_desc));
+    }
+}

-        std::shared_ptr<BlockDesc> ProgramDesc::Block(size_t idx) {
-            return blocks_[idx];
-        }
+std::shared_ptr<BlockDesc> ProgramDesc::Block(size_t idx) {
+    return blocks_[idx];
+}

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/program_desc.h
+++ b/src/framework/program_desc.h
@@ -25,20 +25,18 @@ SOFTWARE.
 #include "paddle_mobile_object.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        class ProgramDesc : PaddleMobileObject {
-          public:
-            ProgramDesc(const proto::ProgramDesc &desc);
-            std::shared_ptr<BlockDesc> Block(size_t idx);
-            const std::vector<std::shared_ptr<BlockDesc>> &Blocks() {
-                return blocks_;
-            };
+class ProgramDesc : PaddleMobileObject {
+  public:
+    ProgramDesc(const proto::ProgramDesc &desc);
+    std::shared_ptr<BlockDesc> Block(size_t idx);
+    const std::vector<std::shared_ptr<BlockDesc>> &Blocks() { return blocks_; };

-          private:
-            std::vector<std::shared_ptr<BlockDesc>> blocks_;
-            proto::ProgramDesc desc_;
-        };
+  private:
+    std::vector<std::shared_ptr<BlockDesc>> blocks_;
+    proto::ProgramDesc desc_;
+};

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/scope.cc
+++ b/src/framework/scope.cc
@@ -4,116 +4,116 @@
 #include <vector>

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        Scope &Scope::NewScope() const {
-            std::unique_lock<std::mutex> lock(mutex_);
-            kids_.push_back(new Scope(this));
-            return *kids_.back();
-        }
+Scope &Scope::NewScope() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    kids_.push_back(new Scope(this));
+    return *kids_.back();
+}

-        Variable *Scope::Var(const std::string &name) {
-            auto *pvar = FindVarLocally(name);
-            if (pvar != nullptr) {
-                return pvar;
-            };
-            pvar = new Variable;
-            vars_[name] = pvar;
-            pvar->name_ = &(vars_.find(name)->first);
-            return pvar;
-        }
+Variable *Scope::Var(const std::string &name) {
+    auto *pvar = FindVarLocally(name);
+    if (pvar != nullptr) {
+        return pvar;
+    };
+    pvar = new Variable;
+    vars_[name] = pvar;
+    pvar->name_ = &(vars_.find(name)->first);
+    return pvar;
+}

-        //            Variable* Scope::Var(std::string* name) {
-        //                auto var_name = string::Sprintf("%p.%d", this,
-        //                vars_.size());
-        //                if (name != nullptr) {
-        //                    *name = var_name;
-        //                }
-        //                return Var(var_name);
-        //            }
+//            Variable* Scope::Var(std::string* name) {
+//                auto var_name = string::Sprintf("%p.%d", this,
+//                vars_.size());
+//                if (name != nullptr) {
+//                    *name = var_name;
+//                }
+//                return Var(var_name);
+//            }

-        Variable *Scope::FindVar(const std::string &name) const {
-            auto *pvar = FindVarLocally(name);
-            if (pvar != nullptr) {
-                return pvar;
-            }
-            return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
-        }
+Variable *Scope::FindVar(const std::string &name) const {
+    auto *pvar = FindVarLocally(name);
+    if (pvar != nullptr) {
+        return pvar;
+    }
+    return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+}

-        const Scope *Scope::FindScope(const Variable *var) const {
-            for (auto &name_var : vars_) {
-                if (name_var.second == var) {
-                    return this;
-                }
-            }
-            return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+const Scope *Scope::FindScope(const Variable *var) const {
+    for (auto &name_var : vars_) {
+        if (name_var.second == var) {
+            return this;
        }
+    }
+    return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+}

-        void Scope::DropKids() {
-            for (Scope *s : kids_) {
-                delete s;
-            }
-            kids_.clear();
-        }
+void Scope::DropKids() {
+    for (Scope *s : kids_) {
+        delete s;
+    }
+    kids_.clear();
+}

-        std::vector<std::string> Scope::LocalVarNames() const {
-            std::vector<std::string> known_vars;
-            known_vars.reserve(vars_.size());
-            for (auto &name_var : vars_) {
-                known_vars.emplace_back(name_var.first);
-            }
-            return known_vars;
-        }
+std::vector<std::string> Scope::LocalVarNames() const {
+    std::vector<std::string> known_vars;
+    known_vars.reserve(vars_.size());
+    for (auto &name_var : vars_) {
+        known_vars.emplace_back(name_var.first);
+    }
+    return known_vars;
+}

-        void Scope::DeleteScope(Scope *scope) const {
-            std::unique_lock<std::mutex> lock(mutex_);
-            auto it = std::find(kids_.begin(), kids_.end(), scope);
-            kids_.erase(it);
-            delete scope;
-            // deferent
-        }
+void Scope::DeleteScope(Scope *scope) const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    auto it = std::find(kids_.begin(), kids_.end(), scope);
+    kids_.erase(it);
+    delete scope;
+    // deferent
+}

-        void Scope::EraseVars(const std::vector<std::string> &var_names) {
-            std::set<std::string> var_set(var_names.begin(), var_names.end());
-            for (auto it = vars_.begin(); it != vars_.end();) {
-                if (var_set.find(it->first) != var_set.end()) {
-                    delete it->second;
-                    it = vars_.erase(it);
-                } else {
-                    ++it;
-                }
-            }
+void Scope::EraseVars(const std::vector<std::string> &var_names) {
+    std::set<std::string> var_set(var_names.begin(), var_names.end());
+    for (auto it = vars_.begin(); it != vars_.end();) {
+        if (var_set.find(it->first) != var_set.end()) {
+            delete it->second;
+            it = vars_.erase(it);
+        } else {
+            ++it;
        }
+    }
+}

-        void Scope::Rename(const std::string &origin_name,
-                           const std::string &new_name) const {
-            auto origin_it = vars_.find(origin_name);
-            if (origin_it == vars_.end()) {
-                return;
-            }
-            auto new_it = vars_.find(new_name);
-            if (new_it != vars_.end()) {
-                return;
-            }
-            vars_[new_name] = origin_it->second;
-            vars_.erase(origin_it);
-        }
-        //
-        //            std::string Scope::Rename(const std::string& origin_name)
-        //            const {
-        //                auto var_name = string::Sprintf("%p.%d", this,
-        //                vars_.size());
-        //                Rename(origin_name, var_name);
-        //                return var_name;
-        //            }
+void Scope::Rename(const std::string &origin_name,
+                   const std::string &new_name) const {
+    auto origin_it = vars_.find(origin_name);
+    if (origin_it == vars_.end()) {
+        return;
+    }
+    auto new_it = vars_.find(new_name);
+    if (new_it != vars_.end()) {
+        return;
+    }
+    vars_[new_name] = origin_it->second;
+    vars_.erase(origin_it);
+}
+//
+//            std::string Scope::Rename(const std::string& origin_name)
+//            const {
+//                auto var_name = string::Sprintf("%p.%d", this,
+//                vars_.size());
+//                Rename(origin_name, var_name);
+//                return var_name;
+//            }

-        Variable *Scope::FindVarLocally(const std::string &name) const {
-            auto it = vars_.find(name);
-            if (it != vars_.end()) {
-                return it->second;
-            }
-            return nullptr;
-        }
+Variable *Scope::FindVarLocally(const std::string &name) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+        return it->second;
+    }
+    return nullptr;
+}

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/scope.h
+++ b/src/framework/scope.h
@@ -24,58 +24,58 @@ SOFTWARE.
 #include <unordered_map> //std::unordered_map

 namespace paddle_mobile {
-    namespace framework {
-        class Scope {
-          public:
-            Scope() {}
-            ~Scope() {}
+namespace framework {
+class Scope {
+  public:
+    Scope() {}
+    ~Scope() {}

-            Scope &NewScope() const;
+    Scope &NewScope() const;

-            /// Create a variable with given name if it doesn't exist.
-            Variable *Var(const std::string &name);
+    /// Create a variable with given name if it doesn't exist.
+    Variable *Var(const std::string &name);

-            /// Create a variable with a scope-unique name.
-            Variable *Var(std::string *name = nullptr);
+    /// Create a variable with a scope-unique name.
+    Variable *Var(std::string *name = nullptr);

-            void EraseVars(const std::vector<std::string> &var_names);
+    void EraseVars(const std::vector<std::string> &var_names);

-            /// Find a variable in the scope or any of its ancestors.  Returns
-            /// nullptr if cannot find.
-            Variable *FindVar(const std::string &name) const;
+    /// Find a variable in the scope or any of its ancestors.  Returns
+    /// nullptr if cannot find.
+    Variable *FindVar(const std::string &name) const;

-            const Scope *parent() const { return parent_; }
+    const Scope *parent() const { return parent_; }

-            /// Find the scope or an ancestor scope that contains the given
-            /// variable.
-            const Scope *FindScope(const Variable *var) const;
+    /// Find the scope or an ancestor scope that contains the given
+    /// variable.
+    const Scope *FindScope(const Variable *var) const;

-            void DeleteScope(Scope *scope) const;
+    void DeleteScope(Scope *scope) const;

-            /// Drop all kids scopes belonged to this scope.
-            void DropKids();
+    /// Drop all kid scopes belonging to this scope.
+    void DropKids();

-            // enumerate all the variables current contains.
-            std::vector<std::string> LocalVarNames() const;
+    // Enumerate all the variables this scope currently contains.
+    std::vector<std::string> LocalVarNames() const;

-            // Rename variable to a new name
-            void Rename(const std::string &origin_name,
-                        const std::string &new_name) const;
+    // Rename variable to a new name
+    void Rename(const std::string &origin_name,
+                const std::string &new_name) const;

-            // Rename variable to a new name and return the new name
-            std::string Rename(const std::string &origin_name) const;
+    // Rename variable to a new name and return the new name
+    std::string Rename(const std::string &origin_name) const;

-            Variable *FindVarLocally(const std::string &name) const;
+    Variable *FindVarLocally(const std::string &name) const;

-          private:
-            // Call Scope::NewScope for a sub-scope.
-            explicit Scope(Scope const *parent) : parent_(parent) {}
+  private:
+    // Call Scope::NewScope for a sub-scope.
+    explicit Scope(Scope const *parent) : parent_(parent) {}

-            mutable std::unordered_map<std::string, Variable *> vars_;
-            mutable std::list<Scope *> kids_;
-            Scope const *parent_{nullptr};
+    mutable std::unordered_map<std::string, Variable *> vars_;
+    mutable std::list<Scope *> kids_;
+    Scope const *parent_{nullptr};

-            mutable std::mutex mutex_;
-        };
-    } // namespace framework
+    mutable std::mutex mutex_;
+};
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/selected_rows.h
+++ b/src/framework/selected_rows.h
@@ -24,59 +24,58 @@ SOFTWARE.
 #include "tensor.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        class SelectedRows {
-          public:
-            SelectedRows(const std::vector<int64_t> &rows,
-                         const int64_t &height)
-                : rows_(rows), height_(height) {
-                value_.reset(new Tensor());
-            }
+class SelectedRows {
+  public:
+    SelectedRows(const std::vector<int64_t> &rows, const int64_t &height)
+        : rows_(rows), height_(height) {
+        value_.reset(new Tensor());
+    }

-            SelectedRows() {
-                height_ = 0;
-                value_.reset(new Tensor());
-            }
+    SelectedRows() {
+        height_ = 0;
+        value_.reset(new Tensor());
+    }

-            const Tensor &value() const { return *value_; }
+    const Tensor &value() const { return *value_; }

-            Tensor *mutable_value() { return value_.get(); }
+    Tensor *mutable_value() { return value_.get(); }

-            int64_t height() const { return height_; }
+    int64_t height() const { return height_; }

-            void set_height(int64_t height) { height_ = height; }
+    void set_height(int64_t height) { height_ = height; }

-            const std::vector<int64_t> &rows() const { return rows_; }
+    const std::vector<int64_t> &rows() const { return rows_; }

-            std::vector<int64_t> *mutable_rows() { return &rows_; }
+    std::vector<int64_t> *mutable_rows() { return &rows_; }

-            void set_rows(const std::vector<int64_t> &rows) { rows_ = rows; }
+    void set_rows(const std::vector<int64_t> &rows) { rows_ = rows; }

-            /**
-             * get the index of id in rows
-             */
-            int64_t index(int64_t id) const {
-                auto it = std::find(rows_.begin(), rows_.end(), id);
-                //    PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
-                return static_cast<int64_t>(std::distance(rows_.begin(), it));
-            }
+    /**
+     * get the index of id in rows
+     */
+    int64_t index(int64_t id) const {
+        auto it = std::find(rows_.begin(), rows_.end(), id);
+        //    PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
+        return static_cast<int64_t>(std::distance(rows_.begin(), it));
+    }

-            DDim GetCompleteDims() const {
-                std::vector<int64_t> dims = vectorize(value_->dims());
-                dims[0] = height_;
-                return make_ddim(dims);
-            }
+    DDim GetCompleteDims() const {
+        std::vector<int64_t> dims = vectorize(value_->dims());
+        dims[0] = height_;
+        return make_ddim(dims);
+    }

-          private:
-            // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9}
-            // here.
-            // SelectedRows are simply concated when adding together. Until a
-            // SelectedRows add a Tensor, will the duplicate rows be handled.
-            std::vector<int64_t> rows_;
-            std::unique_ptr<Tensor> value_{nullptr};
-            int64_t height_;
-        };
+  private:
+    // Notice: rows can contain duplicates. We can have
+    // {0, 4, 7, 0, 5, 7, 9} here.
+    // SelectedRows are simply concatenated when added together. Duplicate
+    // rows are only handled once a SelectedRows is added to a Tensor.
+    std::vector<int64_t> rows_;
+    std::unique_ptr<Tensor> value_{nullptr};
+    int64_t height_;
+};

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -25,316 +25,310 @@ limitations under the License. */
 #include "memory/t_malloc.h"

 namespace paddle_mobile {
-    namespace framework {
-        template <typename... T> struct SizeOfTypeFunctor;
-
-        template <typename T> struct SizeOfTypeFunctor<T> {
-            size_t operator()(std::type_index type) const {
-                if (typeid(T).hash_code() == type.hash_code()) {
-                    return sizeof(T);
-                } else {
-                    return 0UL;
-                }
-            }
-        };
-
-        template <> struct SizeOfTypeFunctor<> {
-            size_t operator()(std::type_index type) const { return 0UL; }
-        };
-
-        template <typename HEAD, typename... TAIL>
-        struct SizeOfTypeFunctor<HEAD, TAIL...> {
-            size_t operator()(std::type_index type) const {
-                SizeOfTypeFunctor<HEAD> head;
-                size_t head_size = head(type);
-                if (head_size != 0) {
-                    return head_size;
-                }
-                SizeOfTypeFunctor<TAIL...> tail;
-                return tail(type);
-            }
-        };
-
-        static inline size_t SizeOfType(std::type_index type) {
-            SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool,
-                              size_t>
-                functor;
-            size_t size = functor(type);
-            //  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s",
-            //  type.name());
-            return size;
+namespace framework {
+template <typename... T> struct SizeOfTypeFunctor;
+
+template <typename T> struct SizeOfTypeFunctor<T> {
+    size_t operator()(std::type_index type) const {
+        if (typeid(T).hash_code() == type.hash_code()) {
+            return sizeof(T);
+        } else {
+            return 0UL;
        }
-
-        class LoDTensor;
-
-        class Tensor {
-          public:
-            Tensor() : offset_(0) {}
-
-            /*! Return a pointer to mutable memory block. */
-            template <typename T> inline T *data() {
-                check_memory_size();
-                //  PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                //                     holder_->type().hash_code() ==
-                //                     typeid(T).hash_code(),
-                //                 "Tensor holds the wrong type, it holds %s",
-                //                 this->holder_->type().name());
-                return reinterpret_cast<T *>(
-                    reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-            }
-
-            /*! Return a pointer to constant memory block. */
-            template <typename T> inline const T *data() const {
-                check_memory_size();
-                //  PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                //                     holder_->type().hash_code() ==
-                //                     typeid(T).hash_code(),
-                //                 "Tensor holds the wrong type, it holds %s",
-                //                 this->holder_->type().name());
-
-                return reinterpret_cast<const T *>(
-                    reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-            }
-
-            inline bool IsInitialized() const { return holder_ != nullptr; }
-
-            /**
-             * @brief   Return a pointer to mutable memory block.
-             * @note    If not exist, then allocation.
-             */
-            template <typename T> inline T *mutable_data() {
-                static_assert(std::is_pod<T>::value, "T must be POD");
-                return reinterpret_cast<T *>(mutable_data(typeid(T)));
-            }
-
-            inline void *mutable_data(std::type_index type) {
-                if (holder_ != nullptr) {
-                    holder_->set_type(type);
-                }
-                //  PADDLE_ENFORCE_GE(numel(), 0,
-                //                    "When calling this method, the Tensor's
-                //                    numel must be
-                //                    " "equal or larger than zero. " "Please
-                //                    check
-                //                    Tensor::Resize has been called first.");
-                int64_t size = numel() * SizeOfType(type);
-                /* some versions of boost::variant don't have operator!= */
-                if (holder_ == nullptr || holder_->size() < size + offset_) {
-                    holder_.reset(new PlaceholderImpl(size, type));
-
-                    offset_ = 0;
-                }
-                return reinterpret_cast<void *>(
-                    reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-            }
-
-            inline void *mutable_data() {
-                //  PADDLE_ENFORCE(this->holder_ != nullptr,
-                //                 "Cannot invoke mutable data if current hold
-                //                 nothing.");
-                return mutable_data(holder_->type());
-            }
-
-            /**
-             * @brief     Return a pointer to mutable memory block.
-             *
-             * @param[in] dims    The dimensions of the memory block.
-             * @param[in] place   The place of the memory block.
-             *
-             * @note      If not exist, then allocation.
-             */
-            template <typename T> inline T *mutable_data(DDim dims) {
-                static_assert(std::is_pod<T>::value, "T must be POD");
-                Resize(dims);
-                return mutable_data<T>();
-            }
-
-            /*! Return the dimensions of the memory block. */
-            inline const DDim &dims() const { return dims_; }
-
-            /*! Return the numel of the memory block. */
-            inline int64_t numel() const { return product(dims_); }
-
-            /*! Resize the dimensions of the memory block. */
-            inline Tensor &Resize(const DDim &dims) {
-                dims_ = dims;
-                return *this;
-            }
-
-            /*! The internal of two tensors share the same memory block. */
-            inline Tensor &ShareDataWith(const Tensor &src) {
-                src.check_memory_size();
-                *this = src;
-                return *this;
-            }
-
-            /**
-             * @brief  Return a sub-tensor of the given tensor.
-             *
-             * @param[in] begin_idx   The index of the start row(inclusive) to
-             * slice.
-             *                        The index number begins from 0.
-             * @param[in] end_idx     The index of the end row(exclusive) to
-             * slice.
-             *                        The index number begins from 0.
-             */
-            inline Tensor Slice(int begin_idx, int end_idx) const {
-                check_memory_size();
-                //  PADDLE_ENFORCE_GE(begin_idx, 0,
-                //                    "The start row index must be greater than
-                //                    0.");
-                //  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is
-                //  out of
-                //  bound."); PADDLE_ENFORCE_LT(
-                //      begin_idx, end_idx,
-                //      "The start row index must be lesser than the end row
-                //      index.");
-
-                if (dims_[0] == 1) {
-                    return *this;
-                } else {
-                    size_t base = numel() / dims_[0];
-                    Tensor dst;
-                    dst.holder_ = holder_;
-                    dst.set_layout(layout_);
-                    DDim dst_dims = dims_;
-                    dst_dims[0] = end_idx - begin_idx;
-                    dst.Resize(dst_dims);
-                    dst.offset_ =
-                        offset_ + begin_idx * base * SizeOfType(type());
-                    return dst;
-                }
-            }
-
-            std::type_index type() const {
-                //                PADDLE_ENFORCE_NOT_NULL(
-                //                        holder_, "Tensor not initialized yet
-                //                        when
-                //                        Tensor::type() is called.");
-                return holder_->type();
-            }
-
-            // memory size returns the holding memory size in byte.
-            size_t memory_size() const {
-                return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-            }
-
-            inline void check_memory_size() const {
-                //  PADDLE_ENFORCE_NOT_NULL(
-                //      holder_, "Tensor holds no memory. Call
-                //      Tensor::mutable_data
-                //      first.");
-                //  PADDLE_ENFORCE_LE(
-                //      numel() * SizeOfType(type()), memory_size(),
-                //      "Tensor's dims_ is out of bound. Call
-                //      Tensor::mutable_data "
-                //      "first to re-allocate memory.\n"
-                //      "or maybe the required data-type mismatches the data
-                //      already
-                //      stored.");
-            }
-
-            inline DataLayout layout() const { return layout_; }
-
-            inline void set_layout(const DataLayout layout) {
-                layout_ = layout;
-            }
-
-          private:
-            /**
-             * @note    Placeholder hides type T, so it doesn't appear as a
-             * template
-             *          parameter of Variable.
-             */
-            struct Placeholder {
-                virtual ~Placeholder() = default;
-
-                virtual void *ptr() const = 0;
-
-                virtual size_t size() const = 0;
-
-                virtual std::type_index type() const = 0;
-
-                virtual void set_type(std::type_index type) = 0;
-            };
-
-            struct PlaceholderImpl : public Placeholder {
-                PlaceholderImpl(size_t size, std::type_index type)
-                    : ptr_(static_cast<uint8_t *>(memory::Alloc(size)),
-                           memory::PODDeleter<uint8_t>()),
-                      size_(size), type_(type) {
-                    //                    PADDLE_ENFORCE_NOT_NULL(ptr_,
-                    //                    "Insufficient %s
-                    //                    memory to allocation.",
-                    //                                            (is_cpu_place(place_)
-                    //                                            ?
-                    //                                            "CPU" :
-                    //                                            "GPU"));
-                }
-
-                virtual size_t size() const { return size_; }
-
-                virtual void *ptr() const {
-                    return static_cast<void *>(ptr_.get());
-                }
-
-                virtual std::type_index type() const { return type_; }
-
-                virtual void set_type(std::type_index type) { type_ = type; }
-
-                /*! the pointer of memory block. */
-                std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
-
-                /*! the size of memory block. */
-                size_t size_;
-
-                /* the current type of memory */
-                std::type_index type_;
-            };
-
-            /*! holds the memory block if allocated. */
-            std::shared_ptr<Placeholder> holder_;
-
-            /**
-             * @brief points to elements dimensions.
-             *
-             * @note dims_ do not indicate the memory block size.
-             */
-
-            DDim dims_;
-
-            /**
-             * @brief the layout of memory block, default is NHWC.
-             *
-             * @note the memory allocation order, describe how weight/data is
-             * stored
-             *       For example, in 4-D Tensor(rank=4), there are three
-             * commonly
-             *       used layout. They are
-             *            NCHW, NHWC, CHWN.
-             *       N,C,H,W for respectively the batch size, the number of
-             *       feature maps, the height, the width.
-             */
-
-            DataLayout layout_ = DataLayout::kNHWC;
-
-            /**
-             * @brief   A PlaceHolder may be shared by more than one tensor.
-             *
-             * @note    Some of them may be slices of the others. So the offset_
-             *          is introduced here to indicate the byte offset between
-             *          PlaceHolder::ptr_ and where the tensor data really
-             * begins.
-             */
-            size_t offset_;
-        };
-
-        inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
-            Tensor res;
-            res.ShareDataWith(src);
-            res.Resize(flatten_to_2d(src.dims(), num_col_dims));
-            return res;
+    }
+};
+
+template <> struct SizeOfTypeFunctor<> {
+    size_t operator()(std::type_index type) const { return 0UL; }
+};
+
+template <typename HEAD, typename... TAIL>
+struct SizeOfTypeFunctor<HEAD, TAIL...> {
+    size_t operator()(std::type_index type) const {
+        SizeOfTypeFunctor<HEAD> head;
+        size_t head_size = head(type);
+        if (head_size != 0) {
+            return head_size;
+        }
+        SizeOfTypeFunctor<TAIL...> tail;
+        return tail(type);
+    }
+};
+
+static inline size_t SizeOfType(std::type_index type) {
+    SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t>
+        functor;
+    size_t size = functor(type);
+    //  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s",
+    //  type.name());
+    return size;
+}
+
+class LoDTensor;
+
+class Tensor {
+  public:
+    Tensor() : offset_(0) {}
+
+    /*! Return a pointer to mutable memory block. */
+    template <typename T> inline T *data() {
+        check_memory_size();
+        //  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+        //                     holder_->type().hash_code() ==
+        //                     typeid(T).hash_code(),
+        //                 "Tensor holds the wrong type, it holds %s",
+        //                 this->holder_->type().name());
+        return reinterpret_cast<T *>(
+            reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+    }
+
+    /*! Return a pointer to constant memory block. */
+    template <typename T> inline const T *data() const {
+        check_memory_size();
+        //  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+        //                     holder_->type().hash_code() ==
+        //                     typeid(T).hash_code(),
+        //                 "Tensor holds the wrong type, it holds %s",
+        //                 this->holder_->type().name());
+
+        return reinterpret_cast<const T *>(
+            reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+    }
+
+    inline bool IsInitialized() const { return holder_ != nullptr; }
+
+    /**
+     * @brief   Return a pointer to mutable memory block.
+     * @note    If not exist, then allocation.
+     */
+    template <typename T> inline T *mutable_data() {
+        static_assert(std::is_pod<T>::value, "T must be POD");
+        return reinterpret_cast<T *>(mutable_data(typeid(T)));
+    }
+
+    inline void *mutable_data(std::type_index type) {
+        if (holder_ != nullptr) {
+            holder_->set_type(type);
+        }
+        //  PADDLE_ENFORCE_GE(numel(), 0,
+        //                    "When calling this method, the Tensor's
+        //                    numel must be
+        //                    " "equal or larger than zero. " "Please
+        //                    check
+        //                    Tensor::Resize has been called first.");
+        int64_t size = numel() * SizeOfType(type);
+        /* some versions of boost::variant don't have operator!= */
+        if (holder_ == nullptr || holder_->size() < size + offset_) {
+            holder_.reset(new PlaceholderImpl(size, type));
+
+            offset_ = 0;
+        }
+        return reinterpret_cast<void *>(
+            reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+    }
+
+    inline void *mutable_data() {
+        //  PADDLE_ENFORCE(this->holder_ != nullptr,
+        //                 "Cannot invoke mutable data if current hold
+        //                 nothing.");
+        return mutable_data(holder_->type());
+    }
+
+    /**
+     * @brief     Return a pointer to mutable memory block.
+     *
+     * @param[in] dims    The dimensions of the memory block.
+     * @tparam    T       The POD element type of the memory block.
+     *
+     * @note      If not exist, then allocation.
+     */
+    template <typename T> inline T *mutable_data(DDim dims) {
+        static_assert(std::is_pod<T>::value, "T must be POD");
+        Resize(dims);
+        return mutable_data<T>();
+    }
+
+    /*! Return the dimensions of the memory block. */
+    inline const DDim &dims() const { return dims_; }
+
+    /*! Return the numel of the memory block. */
+    inline int64_t numel() const { return product(dims_); }
+
+    /*! Resize the dimensions of the memory block. */
+    inline Tensor &Resize(const DDim &dims) {
+        dims_ = dims;
+        return *this;
+    }
+
+    /*! Make this tensor share the same memory block as src. */
+    inline Tensor &ShareDataWith(const Tensor &src) {
+        src.check_memory_size();
+        *this = src;
+        return *this;
+    }
+
+    /**
+     * @brief  Return a sub-tensor of the given tensor.
+     *
+     * @param[in] begin_idx   The index of the start row(inclusive) to
+     * slice.
+     *                        The index number begins from 0.
+     * @param[in] end_idx     The index of the end row(exclusive) to
+     * slice.
+     *                        The index number begins from 0.
+     */
+    inline Tensor Slice(int begin_idx, int end_idx) const {
+        check_memory_size();
+        //  PADDLE_ENFORCE_GE(begin_idx, 0,
+        //                    "The start row index must be greater than
+        //                    0.");
+        //  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is
+        //  out of
+        //  bound."); PADDLE_ENFORCE_LT(
+        //      begin_idx, end_idx,
+        //      "The start row index must be lesser than the end row
+        //      index.");
+
+        if (dims_[0] == 1) {
+            return *this;
+        } else {
+            size_t base = numel() / dims_[0];
+            Tensor dst;
+            dst.holder_ = holder_;
+            dst.set_layout(layout_);
+            DDim dst_dims = dims_;
+            dst_dims[0] = end_idx - begin_idx;
+            dst.Resize(dst_dims);
+            dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
+            return dst;
+        }
+    }
+
+    std::type_index type() const {
+        //                PADDLE_ENFORCE_NOT_NULL(
+        //                        holder_, "Tensor not initialized yet
+        //                        when
+        //                        Tensor::type() is called.");
+        return holder_->type();
+    }
+
+    // memory_size returns the size of the held memory in bytes.
+    size_t memory_size() const {
+        return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+    }
+
+    inline void check_memory_size() const {
+        //  PADDLE_ENFORCE_NOT_NULL(
+        //      holder_, "Tensor holds no memory. Call
+        //      Tensor::mutable_data
+        //      first.");
+        //  PADDLE_ENFORCE_LE(
+        //      numel() * SizeOfType(type()), memory_size(),
+        //      "Tensor's dims_ is out of bound. Call
+        //      Tensor::mutable_data "
+        //      "first to re-allocate memory.\n"
+        //      "or maybe the required data-type mismatches the data
+        //      already
+        //      stored.");
+    }
+
+    inline DataLayout layout() const { return layout_; }
+
+    inline void set_layout(const DataLayout layout) { layout_ = layout; }
+
+  private:
+    /**
+     * @note    Placeholder hides type T, so it doesn't appear as a
+     * template
+     *          parameter of Variable.
+     */
+    struct Placeholder {
+        virtual ~Placeholder() = default;
+
+        virtual void *ptr() const = 0;
+
+        virtual size_t size() const = 0;
+
+        virtual std::type_index type() const = 0;
+
+        virtual void set_type(std::type_index type) = 0;
+    };
+
+    struct PlaceholderImpl : public Placeholder {
+        PlaceholderImpl(size_t size, std::type_index type)
+            : ptr_(static_cast<uint8_t *>(memory::Alloc(size)),
+                   memory::PODDeleter<uint8_t>()),
+              size_(size), type_(type) {
+            //                    PADDLE_ENFORCE_NOT_NULL(ptr_,
+            //                    "Insufficient %s
+            //                    memory to allocation.",
+            //                                            (is_cpu_place(place_)
+            //                                            ?
+            //                                            "CPU" :
+            //                                            "GPU"));
        }

-    } // namespace framework
+        virtual size_t size() const { return size_; }
+
+        virtual void *ptr() const { return static_cast<void *>(ptr_.get()); }
+
+        virtual std::type_index type() const { return type_; }
+
+        virtual void set_type(std::type_index type) { type_ = type; }
+
+        /*! the pointer of memory block. */
+        std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
+
+        /*! the size of memory block. */
+        size_t size_;
+
+        /* the current type of memory */
+        std::type_index type_;
+    };
+
+    /*! holds the memory block if allocated. */
+    std::shared_ptr<Placeholder> holder_;
+
+    /**
+     * @brief points to elements dimensions.
+     *
+     * @note dims_ do not indicate the memory block size.
+     */
+
+    DDim dims_;
+
+    /**
+     * @brief the layout of memory block, default is NHWC.
+     *
+     * @note the memory allocation order, describe how weight/data is
+     * stored
+     *       For example, in 4-D Tensor(rank=4), there are three
+     * commonly
+     *       used layout. They are
+     *            NCHW, NHWC, CHWN.
+     *       N,C,H,W for respectively the batch size, the number of
+     *       feature maps, the height, the width.
+     */
+
+    DataLayout layout_ = DataLayout::kNHWC;
+
+    /**
+     * @brief   A PlaceHolder may be shared by more than one tensor.
+     *
+     * @note    Some of them may be slices of the others. So the offset_
+     *          is introduced here to indicate the byte offset between
+     *          PlaceHolder::ptr_ and where the tensor data really
+     * begins.
+     */
+    size_t offset_;
+};
+
+inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
+    Tensor res;
+    res.ShareDataWith(src);
+    res.Resize(flatten_to_2d(src.dims(), num_col_dims));
+    return res;
+}
+
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/tensor_util.cc
+++ b/src/framework/tensor_util.cc
@@ -18,189 +18,187 @@
 #include <vector>

 namespace paddle_mobile {
-    namespace framework {
-
-        void TensorCopy(const Tensor &src, Tensor *dst) {
-            //  VLOG(3) << "TensorCopy " << src.dims() << " from " <<
-            //  src.place() << " to
-            //  "
-            //          << dst_place;
-            src.check_memory_size();
-
-            dst->Resize(src.dims());
-            dst->set_layout(src.layout());
-            auto src_ptr = src.data<void>();
-
-            auto dst_ptr = dst->mutable_data(src.type());
-
-            auto size = src.numel() * SizeOfType(src.type());
-
-            memory::Copy(dst_ptr, src_ptr, size);
-        }
-
-        void TensorCopySync(const Tensor &src, Tensor *dst) {
-            //  VLOG(3) << "TensorCopySync " << src.dims() << " from " <<
-            //  src.place()
-            //          << " to " << dst_place;
-            src.check_memory_size();
-            dst->Resize(src.dims());
-            dst->set_layout(src.layout());
-            auto src_ptr = src.data<void>();
-            auto dst_ptr = dst->mutable_data(src.type());
-            auto size = src.numel() * SizeOfType(src.type());
-            memory::Copy(dst_ptr, src_ptr, size);
-        }
-
-        template <typename Predicate> struct AnyDTypeVisitor {
-            Predicate predicate_;
-            const Tensor &tensor_;
-            Tensor *out_;
-
-            AnyDTypeVisitor(Predicate predicate, const Tensor &tensor,
-                            Tensor *out)
-                : predicate_(predicate), tensor_(tensor), out_(out) {}
-
-            template <typename T> void operator()() const {
-                //    auto t = EigenVector<T>::Flatten(tensor_);
-                //    auto o = EigenScalar<bool>::From(*out_);
-                // return any of predicate_(t) is true.
-                //    o.device(*ctx_.eigen_device()) = predicate_(t).any();
-            }
-        };
-
-        template <typename Predicate>
-        inline void AnyImpl(Predicate predicate, const Tensor &tensor,
-                            framework::Tensor *out) {
-            VisitDataType(ToDataType(tensor.type()),
-                          AnyDTypeVisitor<Predicate>(predicate, tensor, out));
-        }
-
-        template <typename Predicate> struct AnyVisitor {
-            const framework::Tensor &tensor_;
-            Predicate predicate_;
-
-            AnyVisitor(const framework::Tensor &tensor, Predicate predicate)
-                : tensor_(tensor), predicate_(std::move(predicate)) {}
-
-            bool operator()(void) const {
-                framework::Tensor out;
-                out.Resize({1});
-                out.mutable_data<bool>();
-                AnyImpl(predicate_, tensor_, &out);
-                return this->GetResult(out);
-            }
-
-            bool GetResult(const framework::Tensor &out) const {
-                return *out.data<bool>();
-            }
-        };
-
-        template <typename Predicate>
-        inline bool Any(const framework::Tensor &tensor, Predicate predicate) {
-            AnyVisitor<Predicate> visitor(tensor, predicate);
-            //  return platform::VisitPlace(visitor);
-            return visitor();
-        }
-
-        struct ContainsNANPredicate {
-            template <typename T>
-            auto operator()(const T &eigen_vec) const
-                -> decltype(std::declval<T>().isnan()) {
-                // Cast eigen_vector to vector of bool. true if is inf.
-                return eigen_vec.isnan();
-            }
-        };
-
-        bool TensorContainsNAN(const framework::Tensor &tensor) {
-            ContainsNANPredicate predicate;
-            return Any(tensor, predicate);
-        }
-
-        struct ContainsInfPredicate {
-            template <typename T>
-            auto operator()(const T &eigen_vec) const
-                -> decltype(std::declval<T>().isinf()) {
-                // Cast eigen_vector to vector of bool. true if is inf.
-                return eigen_vec.isinf();
-            }
-        };
-
-        bool TensorContainsInf(const framework::Tensor &tensor) {
-            ContainsInfPredicate predicate;
-            return Any(tensor, predicate);
-        }
-
-        void TensorToStream(std::ostream &os, const Tensor &tensor) {
-            { // the 1st field, uint32_t version
-                constexpr uint32_t version = 0;
-                os.write(reinterpret_cast<const char *>(&version),
-                         sizeof(version));
-            }
-            { // the 2nd field, tensor description
-                // int32_t  size
-                // void*    protobuf message
-                proto::VarType::TensorDesc desc;
-                desc.set_data_type(framework::ToDataType(tensor.type()));
-                auto dims = framework::vectorize(tensor.dims());
-                auto *pb_dims = desc.mutable_dims();
-                pb_dims->Resize(static_cast<int>(dims.size()), 0);
-                std::copy(dims.begin(), dims.end(), pb_dims->begin());
-                int32_t size = desc.ByteSize();
-                os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-                auto out = desc.SerializeAsString();
-                os.write(out.data(), size);
-            }
-            { // the 3rd field, tensor data
-                uint64_t size = tensor.memory_size();
-                auto *data_ptr = tensor.data<void>();
-                //    PADDLE_ENFORCE(size <
-                //    std::numeric_limits<std::streamsize>::max(),
-                //                   "Index overflow when writing tensor");
-
-                os.write(static_cast<const char *>(data_ptr),
-                         static_cast<std::streamsize>(size));
-            }
-        }
-
-        struct DeserializedDataFunctor {
-            DeserializedDataFunctor(void **buf, Tensor *tensor)
-                : buf_(buf), tensor_(tensor) {}
-
-            template <typename T> void operator()() {
-                *buf_ = tensor_->mutable_data<T>();
-            }
-
-            void **buf_;
-            Tensor *tensor_;
-        };
-
-        void TensorFromStream(std::istream &is, framework::Tensor *tensor) {
-            uint32_t version;
-            is.read(reinterpret_cast<char *>(&version), sizeof(version));
-            //  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-            proto::VarType::TensorDesc desc;
-            { // int32_t size
-                // proto buffer
-                int32_t size;
-                is.read(reinterpret_cast<char *>(&size), sizeof(size));
-                std::unique_ptr<char[]> buf(new char[size]);
-                is.read(reinterpret_cast<char *>(buf.get()), size);
-                //    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                //                   "Cannot parse tensor desc");
-            }
-            { // read tensor
-                std::vector<int64_t> dims;
-                dims.reserve(static_cast<size_t>(desc.dims().size()));
-                std::copy(desc.dims().begin(), desc.dims().end(),
-                          std::back_inserter(dims));
-                tensor->Resize(framework::make_ddim(dims));
-                void *buf;
-
-                framework::VisitDataType(desc.data_type(),
-                                         DeserializedDataFunctor(&buf, tensor));
-                is.read(static_cast<char *>(buf), tensor->memory_size());
-            }
-        }
-
-    } // namespace framework
+namespace framework {
+
+void TensorCopy(const Tensor &src, Tensor *dst) {
+    // Copies src into dst. dst is resized to src's dims, takes src's
+    // layout, and mutable_data(src.type()) supplies the destination
+    // pointer; numel * SizeOfType(type) bytes then go through
+    // memory::Copy. (The VLOG trace that was written here is disabled.)
+    src.check_memory_size();
+
+    dst->Resize(src.dims());
+    dst->set_layout(src.layout());
+    auto src_ptr = src.data<void>();
+
+    auto dst_ptr = dst->mutable_data(src.type());
+
+    auto size = src.numel() * SizeOfType(src.type());
+
+    memory::Copy(dst_ptr, src_ptr, size);
+}
+
+void TensorCopySync(const Tensor &src, Tensor *dst) {
+    // Statement-for-statement the same as TensorCopy: check src, give
+    // dst src's dims and layout, take dst's pointer for src's type, then
+    // memory::Copy numel * SizeOfType(type) bytes. (VLOG is disabled.)
+    src.check_memory_size();
+    dst->Resize(src.dims());
+    dst->set_layout(src.layout());
+    auto src_ptr = src.data<void>();
+    auto dst_ptr = dst->mutable_data(src.type());
+    auto size = src.numel() * SizeOfType(src.type());
+    memory::Copy(dst_ptr, src_ptr, size);
+}
+
+template <typename Predicate> struct AnyDTypeVisitor {
+    Predicate predicate_;
+    const Tensor &tensor_;
+    Tensor *out_;
+
+    AnyDTypeVisitor(Predicate predicate, const Tensor &tensor, Tensor *out)
+        : predicate_(predicate), tensor_(tensor), out_(out) {}
+    // NOTE: the call operator below is a no-op; its body is commented out.
+    template <typename T> void operator()() const {
+        //    auto t = EigenVector<T>::Flatten(tensor_);
+        //    auto o = EigenScalar<bool>::From(*out_);
+        // Intended result: whether any of predicate_(t) is true.
+        //    o.device(*ctx_.eigen_device()) = predicate_(t).any();
+    }
+};
+
+template <typename Predicate>
+inline void AnyImpl(Predicate predicate, const Tensor &tensor,
+                    framework::Tensor *out) {
+    VisitDataType(ToDataType(tensor.type()), // data type of tensor
+                  AnyDTypeVisitor<Predicate>(predicate, tensor, out));
+}
+
+template <typename Predicate> struct AnyVisitor {
+    const framework::Tensor &tensor_; // tensor under test (reference)
+    Predicate predicate_;
+
+    AnyVisitor(const framework::Tensor &tensor, Predicate predicate)
+        : tensor_(tensor), predicate_(std::move(predicate)) {}
+    // Runs AnyImpl into a one-element bool tensor and returns that bool.
+    bool operator()(void) const {
+        framework::Tensor out;
+        out.Resize({1});
+        *out.mutable_data<bool>() = false; // AnyImpl may not write it
+        AnyImpl(predicate_, tensor_, &out);
+        return this->GetResult(out);
+    }
+
+    bool GetResult(const framework::Tensor &out) const {
+        return *out.data<bool>();
+    }
+};
+
+template <typename Predicate>
+inline bool Any(const framework::Tensor &tensor, Predicate predicate) {
+    AnyVisitor<Predicate> visitor(tensor, predicate);
+    // The visitor is called directly; platform::VisitPlace is disabled.
+    return visitor();
+}
+
+struct ContainsNANPredicate {
+    template <typename T>
+    auto operator()(const T &eigen_vec) const
+        -> decltype(std::declval<T>().isnan()) {
+        // Cast eigen_vector to vector of bool: true where a value is NaN.
+        return eigen_vec.isnan();
+    }
+};
+
+bool TensorContainsNAN(const framework::Tensor &tensor) {
+    ContainsNANPredicate predicate; // stateless functor; calls isnan()
+    return Any(tensor, predicate);
+}
+
+struct ContainsInfPredicate {
+    template <typename T>
+    auto operator()(const T &eigen_vec) const
+        -> decltype(std::declval<T>().isinf()) {
+        // Cast eigen_vector to vector of bool: true where a value is inf.
+        return eigen_vec.isinf();
+    }
+};
+
+bool TensorContainsInf(const framework::Tensor &tensor) {
+    ContainsInfPredicate predicate; // stateless functor; calls isinf()
+    return Any(tensor, predicate);
+}
+
+void TensorToStream(std::ostream &os, const Tensor &tensor) {
+    { // the 1st field, uint32_t version
+        constexpr uint32_t version = 0;
+        os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+    }
+    { // the 2nd field, tensor description
+        // int32_t  byte size of the serialized TensorDesc
+        // bytes    the serialized TensorDesc protobuf message
+        proto::VarType::TensorDesc desc;
+        desc.set_data_type(framework::ToDataType(tensor.type()));
+        auto dims = framework::vectorize(tensor.dims());
+        auto *pb_dims = desc.mutable_dims();
+        pb_dims->Resize(static_cast<int>(dims.size()), 0);
+        std::copy(dims.begin(), dims.end(), pb_dims->begin());
+        int32_t size = desc.ByteSize();
+        os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+        auto out = desc.SerializeAsString();
+        os.write(out.data(), size);
+    }
+    { // the 3rd field, tensor data
+        uint64_t size = tensor.memory_size();
+        auto *data_ptr = tensor.data<void>();
+        // NOTE: the PADDLE_ENFORCE guard (size <
+        // std::numeric_limits<std::streamsize>::max()) is disabled, so
+        // the cast to std::streamsize below is unchecked.
+
+        os.write(static_cast<const char *>(data_ptr),
+                 static_cast<std::streamsize>(size));
+    }
+}
+
+struct DeserializedDataFunctor {
+    DeserializedDataFunctor(void **buf, Tensor *tensor)
+        : buf_(buf), tensor_(tensor) {}
+    // Stores tensor_->mutable_data<T>() into *buf_ for the chosen T.
+    template <typename T> void operator()() {
+        *buf_ = tensor_->mutable_data<T>();
+    }
+
+    void **buf_;     // where the data pointer is written
+    Tensor *tensor_; // tensor whose buffer is requested
+};
+
+void TensorFromStream(std::istream &is, framework::Tensor *tensor) {
+    // Reads: uint32 version, int32 desc size, TensorDesc bytes, raw data.
+    uint32_t version = 0; // consumed only; the version check is disabled
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+    proto::VarType::TensorDesc desc;
+    { // int32_t size, then the serialized TensorDesc
+        int32_t size = 0;
+        is.read(reinterpret_cast<char *>(&size), sizeof(size));
+        const int32_t desc_bytes = size > 0 ? size : 0;
+        std::unique_ptr<char[]> buf(new char[desc_bytes]);
+        is.read(buf.get(), desc_bytes);
+        // Parse the descriptor; without it desc stays empty (no dims/type).
+        if (!is || size < 0 || !desc.ParseFromArray(buf.get(), desc_bytes)) {
+            return; // short or corrupt stream: nothing usable to load
+        }
+    }
+    { // read tensor
+        std::vector<int64_t> dims(desc.dims().begin(), desc.dims().end());
+        tensor->Resize(framework::make_ddim(dims));
+        void *buf = nullptr; // stays null if no data type is dispatched
+        framework::VisitDataType(desc.data_type(),
+                                 DeserializedDataFunctor(&buf, tensor));
+        if (buf != nullptr) {
+            is.read(static_cast<char *>(buf), tensor->memory_size());
+        }
+    }
+}
+
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/tensor_util.h
+++ b/src/framework/tensor_util.h
@@ -20,47 +20,47 @@ limitations under the License. */
 #include <vector>

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        void TensorCopy(const Tensor &src, Tensor *dst);
-        void TensorCopySync(const Tensor &src, Tensor *dst);
+void TensorCopy(const Tensor &src, Tensor *dst);
+void TensorCopySync(const Tensor &src, Tensor *dst);

-        template <typename T>
-        void TensorFromVector(const std::vector<T> &src, Tensor *dst);
+template <typename T>
+void TensorFromVector(const std::vector<T> &src, Tensor *dst);

-        template <typename T>
-        void TesnorToVector(const Tensor &src, std::vector<T> *dst);
+template <typename T> // declaration; the definition follows below
+void TensorToVector(const Tensor &src, std::vector<T> *dst);

-        bool TensorContainsNAN(const framework::Tensor &tensor);
-        bool TensorContainsInf(const framework::Tensor &tensor);
+bool TensorContainsNAN(const framework::Tensor &tensor);
+bool TensorContainsInf(const framework::Tensor &tensor);

-        void TensorToStream(std::ostream &os, const Tensor &tensor);
-        void TensorFromStream(std::istream &is, Tensor *tensor);
+void TensorToStream(std::ostream &os, const Tensor &tensor);
+void TensorFromStream(std::istream &is, Tensor *tensor);

-        //
-        // The implementation of template functions.
-        //
+//
+// The implementation of template functions.
+//

-        template <typename T>
-        void TensorFromVector(const std::vector<T> &src, Tensor *dst) {
-            auto src_ptr = static_cast<const void *>(src.data());
-            dst->Resize({static_cast<int64_t>(src.size())});
-            auto dst_ptr = static_cast<void *>(dst->mutable_data<T>());
-            auto size = src.size() * sizeof(T);
+template <typename T>
+void TensorFromVector(const std::vector<T> &src, Tensor *dst) {
+    auto src_ptr = static_cast<const void *>(src.data());
+    dst->Resize({static_cast<int64_t>(src.size())});
+    auto dst_ptr = static_cast<void *>(dst->mutable_data<T>());
+    auto size = src.size() * sizeof(T);

-            memory::Copy(dst_ptr, src_ptr, size);
-        }
+    memory::Copy(dst_ptr, src_ptr, size);
+}

-        template <typename T>
-        void TensorToVector(const Tensor &src, std::vector<T> *dst) {
-            auto src_ptr = static_cast<const void *>(src.data<T>());
-            auto size = src.numel() * sizeof(T);
+template <typename T>
+void TensorToVector(const Tensor &src, std::vector<T> *dst) {
+    auto src_ptr = static_cast<const void *>(src.data<T>());
+    auto size = src.numel() * sizeof(T);

-            dst->resize(src.numel());
-            auto dst_ptr = static_cast<void *>(dst->data());
+    dst->resize(src.numel());
+    auto dst_ptr = static_cast<void *>(dst->data());

-            memory::Copy(dst_ptr, src_ptr, size);
-        }
+    memory::Copy(dst_ptr, src_ptr, size);
+}

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/var_desc.cpp
+++ b/src/framework/var_desc.cpp
@@ -20,9 +20,9 @@ SOFTWARE.

 namespace paddle_mobile {

-    namespace framework {
+namespace framework {

-        VarDesc::VarDesc(const proto::VarDesc &desc) : desc_(desc) {}
+VarDesc::VarDesc(const proto::VarDesc &desc) : desc_(desc) {}

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/var_desc.h
+++ b/src/framework/var_desc.h
@@ -22,68 +22,67 @@ SOFTWARE.
 #include "paddle_mobile_object.h"

 namespace paddle_mobile {
-    namespace framework {
+namespace framework {

-        class VarDesc {
-          public:
-            VarDesc(const proto::VarDesc &desc);
+class VarDesc {
+  public:
+    VarDesc(const proto::VarDesc &desc);

-            std::string Name() const { return desc_.name(); }
+    std::string Name() const { return desc_.name(); }

-            proto::VarType::Type GetType() const { return desc_.type().type(); }
+    proto::VarType::Type GetType() const { return desc_.type().type(); }

-            bool Persistable() const { return desc_.persistable(); }
+    bool Persistable() const { return desc_.persistable(); }

-            const proto::VarType::ChannelDesc &channel_desc() const {
-                switch (desc_.type().type()) {
-                case proto::VarType::CHANNEL:
-                    return desc_.type().channel();
-                default:
-                    break;
-                }
-            }
+    const proto::VarType::ChannelDesc &channel_desc() const {
+        switch (desc_.type().type()) { // only CHANNEL carries this desc
+        case proto::VarType::CHANNEL:
+            return desc_.type().channel();
+        default: // not a channel: return the empty default message
+            return proto::VarType::ChannelDesc::default_instance();
+        }
+    }

-            const proto::VarType::TensorDesc &tensor_desc() const {
-                switch (desc_.type().type()) {
-                case proto::VarType::SELECTED_ROWS:
-                    return desc_.type().selected_rows();
-                case proto::VarType::LOD_TENSOR:
-                    return desc_.type().lod_tensor().tensor();
-                case proto::VarType::LOD_TENSOR_ARRAY:
-                    return desc_.type().tensor_array().tensor();
-                default:
-                    break;
-                }
-            }
+    const proto::VarType::TensorDesc &tensor_desc() const {
+        switch (desc_.type().type()) { // pick the desc for this kind
+        case proto::VarType::SELECTED_ROWS:
+            return desc_.type().selected_rows();
+        case proto::VarType::LOD_TENSOR:
+            return desc_.type().lod_tensor().tensor();
+        case proto::VarType::LOD_TENSOR_ARRAY:
+            return desc_.type().tensor_array().tensor();
+        default: // no tensor payload: return the empty default message
+            return proto::VarType::TensorDesc::default_instance();
+        }
+    }

-            proto::VarType::Type GetDataType() const {
-                switch (desc_.type().type()) {
-                case proto::VarType::CHANNEL:
-                    return channel_desc().data_type();
-                    break;
-                default:
-                    return tensor_desc().data_type();
-                }
-            }
+    proto::VarType::Type GetDataType() const {
+        // Channels report the channel desc's type, the rest the tensor's.
+        switch (desc_.type().type()) {
+        case proto::VarType::CHANNEL:
+            return channel_desc().data_type();
+        default:
+            return tensor_desc().data_type();
+        }
+    }

-            template <typename T>
-            std::vector<T> RepeatedToVector(
-                const google::protobuf::RepeatedField<T> &repeated_field)
-                const {
-                std::vector<T> ret;
-                ret.reserve(repeated_field.size());
-                std::copy(repeated_field.begin(), repeated_field.end(),
-                          std::back_inserter(ret));
-                return ret;
-            }
+    template <typename T>
+    std::vector<T> RepeatedToVector(
+        const google::protobuf::RepeatedField<T> &repeated_field) const {
+        // Copy every element of the protobuf repeated field, in order,
+        // into a std::vector built straight from its iterator range.
+        const auto first = repeated_field.begin();
+        const auto last = repeated_field.end();
+        return std::vector<T>(first, last);
+    }

-            std::vector<int64_t> GetShape() const {
-                return this->RepeatedToVector(tensor_desc().dims());
-            }
+    std::vector<int64_t> GetShape() const {
+        return this->RepeatedToVector(tensor_desc().dims());
+    }

-          private:
-            proto::VarDesc desc_;
-        };
+  private:
+    proto::VarDesc desc_;
+};

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/var_type.h
+++ b/src/framework/var_type.h
@@ -23,17 +23,17 @@ SOFTWARE.
 #include "variable.h"

 namespace paddle_mobile {
-    namespace framework {
-        inline proto::VarType::Type ToVarType(std::type_index type) {
-            if (type.hash_code() == typeid(LoDTensor).hash_code()) {
-                return proto::VarType_Type_LOD_TENSOR;
-            } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
-                return proto::VarType_Type_SELECTED_ROWS;
-            } else {
-                //    PADDLE_THROW("ToVarType:Unsupported type %s",
-                //    type.name());
-            }
-        }
+namespace framework {
+inline proto::VarType::Type ToVarType(std::type_index type) {
+    if (type == std::type_index(typeid(LoDTensor))) { // hashes may collide
+        return proto::VarType_Type_LOD_TENSOR;
+    } else if (type == std::type_index(typeid(SelectedRows))) {
+        return proto::VarType_Type_SELECTED_ROWS;
+    } else {
+        // NOTE(review): PADDLE_THROW is disabled here, so unsupported
+        // types run off the end of this non-void function. TODO: fix.
+    }
+}

-    } // namespace framework
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/framework/variable.h
+++ b/src/framework/variable.h
@@ -26,71 +26,71 @@ SOFTWARE.
 #include <typeinfo>

 namespace paddle_mobile {
-    namespace framework {
-        class Variable : public PaddleMobileObject {
-          public:
-            template <typename T> const T *Get() const {
-                return static_cast<const T *>(holder_->Ptr());
+namespace framework {
+class Variable : public PaddleMobileObject {
+  public:
+    template <typename T> const T *Get() const {
+        return static_cast<const T *>(holder_->Ptr());
+    }
+
+    bool IsInitialized() const { return holder_ != nullptr; }
+
+    const std::string *Name() { return name_; }
+
+    template <typename T> T *GetMutable() {
+        if (!IsType<T>()) {
+            if (*Name() == "pixel") {
+                //        std::cout << " reset " << *Name() <<
+                //        std::endl;
            }
-
-            bool IsInitialized() const { return holder_ != nullptr; }
-
-            const std::string *Name() { return name_; }
-
-            template <typename T> T *GetMutable() {
-                if (!IsType<T>()) {
-                    if (*Name() == "pixel") {
-                        //        std::cout << " reset " << *Name() <<
-                        //        std::endl;
-                    }
-                    holder_.reset(new PlaceholderImp<T>(new T()));
-                }
-                return static_cast<T *>(holder_->Ptr());
-            }
-
-            template <typename T> bool IsType() const {
-                if (holder_) {
-                    //                printf("not null \n");
-                    printf(" holder type : %s, this type %s \n",
-                           holder_->Type().name(), typeid(T).name());
-                }
-
-                //              std::cout << " " << holder_->Type() << " " <<
-                //              typeid(T) <<
-                //              std::endl;
-                return holder_ != nullptr && holder_->Type() == typeid(T);
-            }
-
-            void Clear() { holder_.reset(); }
-
-            std::type_index Type() const { return holder_->Type(); }
-
-            void SetName(const std::string *name) { name_ = name; }
-
-          private:
-            struct Placeholder {
-                Placeholder() = default;
-                virtual ~Placeholder() = default;
-
-                virtual const std::type_info &Type() const = 0;
-                virtual void *Ptr() const = 0;
-            };
-
-            template <typename T> struct PlaceholderImp : public Placeholder {
-                explicit PlaceholderImp(T *ptr) : ptr_(ptr), type_(typeid(T)) {}
-
-                virtual const std::type_info &Type() const { return type_; }
-                virtual void *Ptr() const override {
-                    return static_cast<void *>(ptr_.get());
-                }
-
-                std::unique_ptr<T> ptr_;
-                const std::type_info &type_;
-            };
-
-            std::unique_ptr<Placeholder> holder_;
-            friend class Scope;
-            const std::string *name_;
-        };
-    } // namespace framework
+            holder_.reset(new PlaceholderImp<T>(new T()));
+        }
+        return static_cast<T *>(holder_->Ptr());
+    }
+
+    template <typename T> bool IsType() const {
+        if (holder_) {
+            // NOTE(review): debug trace, printed whenever a holder exists.
+            printf(" holder type : %s, this type %s \n", holder_->Type().name(),
+                   typeid(T).name());
+        }
+
+        // True only when a value is held and the type it reports equals
+        // typeid(T). (An equivalent std::cout trace used to sit here and
+        // is disabled.)
+        return holder_ != nullptr && holder_->Type() == typeid(T);
+    }
+
+    void Clear() { holder_.reset(); }
+
+    std::type_index Type() const { return holder_->Type(); }
+
+    void SetName(const std::string *name) { name_ = name; }
+
+  private:
+    struct Placeholder {
+        Placeholder() = default;
+        virtual ~Placeholder() = default;
+        // Interface: the held value's type_info and an untyped pointer.
+        virtual const std::type_info &Type() const = 0;
+        virtual void *Ptr() const = 0;
+    };
+
+    template <typename T> struct PlaceholderImp : public Placeholder {
+        explicit PlaceholderImp(T *ptr) : ptr_(ptr), type_(typeid(T)) {}
+        // Both accessors override Placeholder's pure virtuals.
+        virtual const std::type_info &Type() const override { return type_; }
+        virtual void *Ptr() const override {
+            return static_cast<void *>(ptr_.get());
+        }
+
+        std::unique_ptr<T> ptr_;     // owns the held T
+        const std::type_info &type_; // typeid(T), bound at construction
+    };
+
+    std::unique_ptr<Placeholder> holder_;
+    friend class Scope;
+    const std::string *name_;
+};
+} // namespace framework
 } // namespace paddle_mobile
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -28,398 +28,386 @@ SOFTWARE.

 namespace paddle_mobile {

-    void ReadBinaryFile(const std::string &filename, std::string *contents) {
-        std::ifstream fin(filename, std::ios::in | std::ios::binary);
-        fin.seekg(0, std::ios::end);
-        contents->clear();
-        contents->resize(fin.tellg());
-        fin.seekg(0, std::ios::beg);
-        fin.read(&(contents->at(0)), contents->size());
-        fin.close();
-    }
-
-    template <typename Dtype, Precision P>
-    void Loader<Dtype, P>::LoadVar(framework::LoDTensor *tensor,
-                                   const std::string &file_path) {
-        //        LOG(kLOG_DEBUG) << "  to load " << file_path;
-        //  Log(kLOG_DEBUG) << "123";
-
-        std::ifstream is(file_path);
-
-        std::streampos pos = is.tellg(); //   save   current   position
-        is.seekg(0, std::ios::end);
-        //        LOG(kLOG_DEBUG) << "  file length = " << is.tellg();
-        is.seekg(pos); //   restore   saved   position
-
-        // 1. version
-        uint32_t version;
-        is.read(reinterpret_cast<char *>(&version), sizeof(version));
-        //        LOG(kLOG_INFO) << "   version: " << version;
-
-        // 2 Lod information
-        uint64_t lod_level;
-        is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-        //        LOG(kLOG_DEBUG) << "   load level: " << lod_level;
-        //        LOG(kLOG_DEBUG) << "   lod info: ";
-        auto &lod = *tensor->mutable_lod();
-        lod.resize(lod_level);
-        for (uint64_t i = 0; i < lod_level; ++i) {
-            uint64_t size;
-            is.read(reinterpret_cast<char *>(&size), sizeof(size));
-            std::vector<size_t> tmp(size / sizeof(size_t));
-            is.read(reinterpret_cast<char *>(tmp.data()),
-                    static_cast<std::streamsize>(size));
-            for (int j = 0; j < tmp.size(); ++j) {
-                LOG(kLOG_DEBUG1) << "    lod - " << tmp[j];
-            }
-            lod[i] = tmp;
-        }
-
-        // 3. tensor version
-        uint32_t tensor_version;
-        is.read(reinterpret_cast<char *>(&tensor_version),
-                sizeof(tensor_version));
-        //  std::cout << "   tensor_version: " << tensor_version << std::endl;
-
-        // 4. tensor desc
-        int32_t size;
+void ReadBinaryFile(const std::string &filename, std::string *contents) {
+    std::ifstream fin(filename, std::ios::in | std::ios::binary);
+    fin.seekg(0, std::ios::end);
+    contents->clear();
+    contents->resize(fin.tellg());
+    fin.seekg(0, std::ios::beg);
+    fin.read(&(contents->at(0)), contents->size());
+    fin.close();
+}
+
+template <typename Dtype, Precision P>
+void Loader<Dtype, P>::LoadVar(framework::LoDTensor *tensor,
+                               const std::string &file_path) {
+    //        LOG(kLOG_DEBUG) << "  to load " << file_path;
+    //  Log(kLOG_DEBUG) << "123";
+
+    std::ifstream is(file_path);
+
+    std::streampos pos = is.tellg(); //   save   current   position
+    is.seekg(0, std::ios::end);
+    //        LOG(kLOG_DEBUG) << "  file length = " << is.tellg();
+    is.seekg(pos); //   restore   saved   position
+
+    // 1. version
+    uint32_t version;
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+    //        LOG(kLOG_INFO) << "   version: " << version;
+
+    // 2 Lod information
+    uint64_t lod_level;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    //        LOG(kLOG_DEBUG) << "   load level: " << lod_level;
+    //        LOG(kLOG_DEBUG) << "   lod info: ";
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+    for (uint64_t i = 0; i < lod_level; ++i) {
+        uint64_t size;
        is.read(reinterpret_cast<char *>(&size), sizeof(size));
-        //  std::cout << "   tensor desc size: " << size << std::endl;
-        std::unique_ptr<char[]> buf(new char[size]);
-        is.read(reinterpret_cast<char *>(buf.get()), size);
-
-        framework::proto::VarType::TensorDesc desc;
-        desc.ParseFromArray(buf.get(), size);
-
-        //  std::cout << "   desc dims size " << desc.dims().size() <<
-        //  std::endl;
-        int memory_size = 1;
-        for (int l = 0; l < desc.dims().size(); ++l) {
-            //    std::cout << "    dim " << l << " value: " << desc.dims()[l]
-            //    <<
-            //    std::endl;
-            memory_size *= desc.dims()[l];
+        std::vector<size_t> tmp(size / sizeof(size_t));
+        is.read(reinterpret_cast<char *>(tmp.data()),
+                static_cast<std::streamsize>(size));
+        for (int j = 0; j < tmp.size(); ++j) {
+            LOG(kLOG_DEBUG1) << "    lod - " << tmp[j];
        }
+        lod[i] = tmp;
+    }

-        std::vector<int64_t> dims;
-        dims.reserve(static_cast<size_t>(desc.dims().size()));
-        std::copy(desc.dims().begin(), desc.dims().end(),
-                  std::back_inserter(dims));
-        tensor->Resize(framework::make_ddim(dims));
-
-        void *memory;
-        int type_size = 0;
-        //  std::cout << "    desc pre type: ";
-        switch (desc.data_type()) {
-        case framework::proto::VarType::FP16:
-            //      std::cout << "FP16" << std::endl;
-            type_size = 2;
-            break;
-        case framework::proto::VarType::FP32:
-            type_size = 4;
-            memory = tensor->mutable_data<float>();
-            //      std::cout << "FP32" << std::endl;
-            break;
-        case framework::proto::VarType::FP64:
-            type_size = 8;
-            //      std::cout << "FP64" << std::endl;
-            break;
-        case framework::proto::VarType::INT32:
-            type_size = 4;
-            //      std::cout << "INT32" << std::endl;
-            break;
-        case framework::proto::VarType::INT64:
-            type_size = 8;
-            //      std::cout << "INT64" << std::endl;
-            break;
-        case framework::proto::VarType::BOOL:
-            type_size = 1;
-            //      std::cout << "BOOL" << std::endl;
-            break;
-        default:
-            break;
-            //      std::cout << "    not support" << std::endl;
-        }
+    // 3. tensor version
+    uint32_t tensor_version;
+    is.read(reinterpret_cast<char *>(&tensor_version), sizeof(tensor_version));
+    //  std::cout << "   tensor_version: " << tensor_version << std::endl;
+
+    // 4. tensor desc
+    int32_t size;
+    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+    //  std::cout << "   tensor desc size: " << size << std::endl;
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char *>(buf.get()), size);
+
+    framework::proto::VarType::TensorDesc desc;
+    desc.ParseFromArray(buf.get(), size);
+
+    //  std::cout << "   desc dims size " << desc.dims().size() <<
+    //  std::endl;
+    int memory_size = 1;
+    for (int l = 0; l < desc.dims().size(); ++l) {
+        //    std::cout << "    dim " << l << " value: " << desc.dims()[l]
+        //    <<
+        //    std::endl;
+        memory_size *= desc.dims()[l];
+    }

-        //  std::cout << "    malloc size: " << memory_size * type_size <<
-        //  std::endl;
-        is.read(static_cast<char *>(memory), memory_size * type_size);
-        //  std::cout << "    memory: " << memory << std::endl;
-        is.close();
-    };
-
-    template <typename Dtype, Precision P>
-    const framework::Program<Dtype, P>
-    Loader<Dtype, P>::Load(const std::string &dirname) {
-        std::string model_filename = dirname + "/__model__";
-        std::string program_desc_str;
-        ReadBinaryFile(model_filename, &program_desc_str);
-        framework::proto::ProgramDesc program_desc_proto;
-        program_desc_proto.ParseFromString(program_desc_str);
-
-        std::shared_ptr<framework::ProgramDesc> originProgramDesc =
-            std::make_shared<framework::ProgramDesc>(program_desc_proto);
-
-        framework::Program<Dtype, P> program;
-        program.originProgram = originProgramDesc;
-
-        std::shared_ptr<framework::Scope> scope =
-            std::make_shared<framework::Scope>();
-        program.scope = scope;
-
-        auto block = originProgramDesc->Block(0);
-
-        for (auto block : originProgramDesc->Blocks()) {
-            //    std::cout << "for block" << std::endl;
-            for (int i = 0; i < block->Vars().size(); ++i) {
-                std::shared_ptr<framework::VarDesc> var_desc = block->Vars()[i];
-                auto var = scope->Var(var_desc->Name());
-                if (var_desc->GetType() ==
-                    framework::proto::VarType::LOD_TENSOR) {
-                    if (var_desc->Persistable() &&
-                        var_desc->GetType() !=
-                            framework::proto::VarType::FEED_MINIBATCH &&
-                        var_desc->GetType() !=
-                            framework::proto::VarType::FETCH_LIST) {
-                        framework::LoDTensor *tensor =
-                            var->GetMutable<framework::LoDTensor>();
-                        // to load
-                        LoadVar(tensor, dirname + "/" + var_desc->Name());
-                    }
-                } else {
-                    //        std::cout << "非 lod" << std::endl;
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+
+    void *memory;
+    int type_size = 0;
+    //  std::cout << "    desc pre type: ";
+    switch (desc.data_type()) {
+    case framework::proto::VarType::FP16:
+        //      std::cout << "FP16" << std::endl;
+        type_size = 2;
+        break;
+    case framework::proto::VarType::FP32:
+        type_size = 4;
+        memory = tensor->mutable_data<float>();
+        //      std::cout << "FP32" << std::endl;
+        break;
+    case framework::proto::VarType::FP64:
+        type_size = 8;
+        //      std::cout << "FP64" << std::endl;
+        break;
+    case framework::proto::VarType::INT32:
+        type_size = 4;
+        //      std::cout << "INT32" << std::endl;
+        break;
+    case framework::proto::VarType::INT64:
+        type_size = 8;
+        //      std::cout << "INT64" << std::endl;
+        break;
+    case framework::proto::VarType::BOOL:
+        type_size = 1;
+        //      std::cout << "BOOL" << std::endl;
+        break;
+    default:
+        break;
+        //      std::cout << "    not support" << std::endl;
+    }
+
+    //  std::cout << "    malloc size: " << memory_size * type_size <<
+    //  std::endl;
+    is.read(static_cast<char *>(memory), memory_size * type_size);
+    //  std::cout << "    memory: " << memory << std::endl;
+    is.close();
+};
+
+template <typename Dtype, Precision P>
+const framework::Program<Dtype, P>
+Loader<Dtype, P>::Load(const std::string &dirname) {
+    std::string model_filename = dirname + "/__model__";
+    std::string program_desc_str;
+    ReadBinaryFile(model_filename, &program_desc_str);
+    framework::proto::ProgramDesc program_desc_proto;
+    program_desc_proto.ParseFromString(program_desc_str);
+
+    std::shared_ptr<framework::ProgramDesc> originProgramDesc =
+        std::make_shared<framework::ProgramDesc>(program_desc_proto);
+
+    framework::Program<Dtype, P> program;
+    program.originProgram = originProgramDesc;
+
+    std::shared_ptr<framework::Scope> scope =
+        std::make_shared<framework::Scope>();
+    program.scope = scope;
+
+    auto block = originProgramDesc->Block(0);
+
+    for (auto block : originProgramDesc->Blocks()) {
+        //    std::cout << "for block" << std::endl;
+        for (int i = 0; i < block->Vars().size(); ++i) {
+            std::shared_ptr<framework::VarDesc> var_desc = block->Vars()[i];
+            auto var = scope->Var(var_desc->Name());
+            if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+                if (var_desc->Persistable() &&
+                    var_desc->GetType() !=
+                        framework::proto::VarType::FEED_MINIBATCH &&
+                    var_desc->GetType() !=
+                        framework::proto::VarType::FETCH_LIST) {
+                    framework::LoDTensor *tensor =
+                        var->GetMutable<framework::LoDTensor>();
+                    // to load
+                    LoadVar(tensor, dirname + "/" + var_desc->Name());
                }
+            } else {
+                //        std::cout << "非 lod" << std::endl;
            }
        }
+    }

 #ifdef PADDLE_MOBILE_DEBUG
-        for (int i = 0; i < program_desc_proto.blocks().size(); ++i) {
-            framework::proto::BlockDesc block = program_desc_proto.blocks()[i];
-            LOG(kLOG_DEBUG) << "block: " << block.idx();
-            for (int j = 0; j < block.ops().size(); ++j) {
-                framework::proto::OpDesc op = block.ops()[j];
-                LOG(kLOG_DEBUG1) << " op: " << op.type();
-                for (int m = 0; m < op.inputs_size(); ++m) {
-                    const framework::proto::OpDesc::Var &var = op.inputs(m);
-                    LOG(kLOG_DEBUG2) << "  input parameter: "
-                                     << var.parameter();
-                    for (int n = 0; n < var.arguments().size(); ++n) {
-                        LOG(kLOG_DEBUG3) << "   argument - "
-                                         << var.arguments()[n];
-                    }
+    for (int i = 0; i < program_desc_proto.blocks().size(); ++i) {
+        framework::proto::BlockDesc block = program_desc_proto.blocks()[i];
+        LOG(kLOG_DEBUG) << "block: " << block.idx();
+        for (int j = 0; j < block.ops().size(); ++j) {
+            framework::proto::OpDesc op = block.ops()[j];
+            LOG(kLOG_DEBUG1) << " op: " << op.type();
+            for (int m = 0; m < op.inputs_size(); ++m) {
+                const framework::proto::OpDesc::Var &var = op.inputs(m);
+                LOG(kLOG_DEBUG2) << "  input parameter: " << var.parameter();
+                for (int n = 0; n < var.arguments().size(); ++n) {
+                    LOG(kLOG_DEBUG3) << "   argument - " << var.arguments()[n];
                }
+            }

-                for (int y = 0; y < op.outputs_size(); ++y) {
-                    const framework::proto::OpDesc::Var &var = op.outputs(y);
-                    LOG(kLOG_DEBUG2) << "  out parameter: " << var.parameter();
-                    for (int z = 0; z < var.arguments().size(); ++z) {
-                        LOG(kLOG_DEBUG3) << "   argument - "
-                                         << var.arguments()[z];
-                    }
+            for (int y = 0; y < op.outputs_size(); ++y) {
+                const framework::proto::OpDesc::Var &var = op.outputs(y);
+                LOG(kLOG_DEBUG2) << "  out parameter: " << var.parameter();
+                for (int z = 0; z < var.arguments().size(); ++z) {
+                    LOG(kLOG_DEBUG3) << "   argument - " << var.arguments()[z];
                }
+            }

-                for (int x = 0; x < op.attrs().size(); ++x) {
-                    const framework::proto::OpDesc_Attr attr = op.attrs()[x];
-                    //        std::cout << "  attr name: " << attr.name() <<
-                    //        std::endl;
-                    //        std::cout << "  attr type: " << attr.type() <<
-                    //        std::endl;
+            for (int x = 0; x < op.attrs().size(); ++x) {
+                const framework::proto::OpDesc_Attr attr = op.attrs()[x];
+                //        std::cout << "  attr name: " << attr.name() <<
+                //        std::endl;
+                //        std::cout << "  attr type: " << attr.type() <<
+                //        std::endl;

-                    switch (attr.type()) {
-                    case framework::proto::AttrType::BOOLEAN:
-                        //            std::cout << "   boolen: " << attr.b() <<
-                        //            std::endl;
-                        break;
-                    case framework::proto::AttrType::INT:
-                        //            std::cout << "   int: " << attr.i() <<
-                        //            std::endl;
-                        break;
-                    case framework::proto::AttrType::FLOAT:
-                    //            std::cout << "   float: " << attr.f() <<
+                switch (attr.type()) {
+                case framework::proto::AttrType::BOOLEAN:
+                    //            std::cout << "   boolen: " << attr.b() <<
                    //            std::endl;
-                    case framework::proto::AttrType::STRING:
-                    //            std::cout << "   string: " << attr.s() <<
+                    break;
+                case framework::proto::AttrType::INT:
+                    //            std::cout << "   int: " << attr.i() <<
                    //            std::endl;
-                    case framework::proto::AttrType::BOOLEANS:
-                        //                            std::vector<bool>
-                        //                            bools(attr.bools_size());
-                        for (int y = 0; y < attr.bools_size(); ++y) {
-                            //              std::cout << "   bool - " <<
-                            //              attr.bools(y) <<
-                            //              std::endl;
-                        }
-                    case framework::proto::AttrType::LONG:
-                    //            std::cout << "   long: " << attr.l() <<
-                    //            std::endl;
-                    case framework::proto::AttrType::FLOATS:
-                        for (int y = 0; y < attr.floats_size(); ++y) {
-                            //              std::cout << "   float - " << y <<
-                            //              ": " <<
-                            //              attr.floats(y)
-                            //                        << std::endl;
-                        }
-                    case framework::proto::AttrType::INTS:
-                        for (int y = 0; y < attr.ints_size(); ++y) {
-                            //              std::cout << "   int - " << y << ":
-                            //              " <<
-                            //              attr.ints(y)
-                            //                        << std::endl;
-                        }
-                    case framework::proto::AttrType::STRINGS:
-                        for (int y = 0; y < attr.strings_size(); ++y) {
-                            //              std::cout << "   string - " << y <<
-                            //              ": " <<
-                            //              attr.strings(y)
-                            //                        << std::endl;
-                        }
+                    break;
+                case framework::proto::AttrType::FLOAT:
+                //            std::cout << "   float: " << attr.f() <<
+                //            std::endl;
+                case framework::proto::AttrType::STRING:
+                //            std::cout << "   string: " << attr.s() <<
+                //            std::endl;
+                case framework::proto::AttrType::BOOLEANS:
+                    //                            std::vector<bool>
+                    //                            bools(attr.bools_size());
+                    for (int y = 0; y < attr.bools_size(); ++y) {
+                        //              std::cout << "   bool - " <<
+                        //              attr.bools(y) <<
+                        //              std::endl;
+                    }
+                case framework::proto::AttrType::LONG:
+                //            std::cout << "   long: " << attr.l() <<
+                //            std::endl;
+                case framework::proto::AttrType::FLOATS:
+                    for (int y = 0; y < attr.floats_size(); ++y) {
+                        //              std::cout << "   float - " << y <<
+                        //              ": " <<
+                        //              attr.floats(y)
+                        //                        << std::endl;
+                    }
+                case framework::proto::AttrType::INTS:
+                    for (int y = 0; y < attr.ints_size(); ++y) {
+                        //              std::cout << "   int - " << y << ":
+                        //              " <<
+                        //              attr.ints(y)
+                        //                        << std::endl;
+                    }
+                case framework::proto::AttrType::STRINGS:
+                    for (int y = 0; y < attr.strings_size(); ++y) {
+                        //              std::cout << "   string - " << y <<
+                        //              ": " <<
+                        //              attr.strings(y)
+                        //                        << std::endl;
                    }
                }
            }
+        }

-            for (int k = 0; k < block.vars().size(); ++k) {
-                framework::proto::VarDesc var = block.vars()[k];
-                if (var.type().type() ==
-                    framework::proto::VarType::LOD_TENSOR) {
-                    //        std::cout << " var name: " << var.name() <<
-                    //        std::endl;
-                    const framework::proto::VarType::TensorDesc &tensor_desc =
-                        var.type().lod_tensor().tensor();
-                    //        std::cout << "  in var tensor desc dims size "
-                    //                  << tensor_desc.dims().size() <<
-                    //                  std::endl;
-                    int memory_size = 1;
-                    for (int l = 0; l < tensor_desc.dims().size(); ++l) {
-                        //          std::cout << " var tensor desc dim " << l
-                        //                    << " value: " <<
-                        //                    tensor_desc.dims()[l] <<
-                        //                    std::endl;
-                    }
+        for (int k = 0; k < block.vars().size(); ++k) {
+            framework::proto::VarDesc var = block.vars()[k];
+            if (var.type().type() == framework::proto::VarType::LOD_TENSOR) {
+                //        std::cout << " var name: " << var.name() <<
+                //        std::endl;
+                const framework::proto::VarType::TensorDesc &tensor_desc =
+                    var.type().lod_tensor().tensor();
+                //        std::cout << "  in var tensor desc dims size "
+                //                  << tensor_desc.dims().size() <<
+                //                  std::endl;
+                int memory_size = 1;
+                for (int l = 0; l < tensor_desc.dims().size(); ++l) {
+                    //          std::cout << " var tensor desc dim " << l
+                    //                    << " value: " <<
+                    //                    tensor_desc.dims()[l] <<
+                    //                    std::endl;
                }
+            }

-                if (var.persistable() &&
-                    var.type().type() !=
-                        framework::proto::VarType::FEED_MINIBATCH &&
-                    var.type().type() !=
-                        framework::proto::VarType::FETCH_LIST) {
-                    //        std::cout << "  to load " << var.name() <<
-                    //        std::endl;
-                    std::string file_path = dirname + "/" + var.name();
-                    std::ifstream is(file_path);
-                    std::streampos pos =
-                        is.tellg(); //   save   current   position
-                    is.seekg(0, std::ios::end);
-                    //        std::cout << "  file length = " << is.tellg() <<
-                    //        std::endl;
-                    is.seekg(pos); //   restore   saved   position
-
-                    // 1. version
-                    uint32_t version;
-                    is.read(reinterpret_cast<char *>(&version),
-                            sizeof(version));
-                    //        std::cout << "   version: " << version <<
-                    //        std::endl;
-
-                    // 2 Lod information
-                    uint64_t lod_level;
-                    is.read(reinterpret_cast<char *>(&lod_level),
-                            sizeof(lod_level));
-                    //        std::cout << "   load level: " << lod_level <<
-                    //        std::endl;
-                    //        std::cout << "   lod info: " << std::endl;
-                    for (uint64_t i = 0; i < lod_level; ++i) {
-                        uint64_t size;
-                        is.read(reinterpret_cast<char *>(&size), sizeof(size));
-                        std::vector<size_t> tmp(size / sizeof(size_t));
-                        is.read(reinterpret_cast<char *>(tmp.data()),
-                                static_cast<std::streamsize>(size));
-                        for (int j = 0; j < tmp.size(); ++j) {
-                            //            std::cout << "    lod - " << tmp[j] <<
-                            //            std::endl;
-                        }
-                    }
-
-                    uint32_t tensor_version;
-                    is.read(reinterpret_cast<char *>(&version),
-                            sizeof(version));
-                    //        std::cout << "   tensor_version: " <<
-                    //        tensor_version <<
-                    //        std::endl;
-
-                    int32_t size;
+            if (var.persistable() &&
+                var.type().type() !=
+                    framework::proto::VarType::FEED_MINIBATCH &&
+                var.type().type() != framework::proto::VarType::FETCH_LIST) {
+                //        std::cout << "  to load " << var.name() <<
+                //        std::endl;
+                std::string file_path = dirname + "/" + var.name();
+                std::ifstream is(file_path);
+                std::streampos pos = is.tellg(); //   save   current   position
+                is.seekg(0, std::ios::end);
+                //        std::cout << "  file length = " << is.tellg() <<
+                //        std::endl;
+                is.seekg(pos); //   restore   saved   position
+
+                // 1. version
+                uint32_t version;
+                is.read(reinterpret_cast<char *>(&version), sizeof(version));
+                //        std::cout << "   version: " << version <<
+                //        std::endl;
+
+                // 2 Lod information
+                uint64_t lod_level;
+                is.read(reinterpret_cast<char *>(&lod_level),
+                        sizeof(lod_level));
+                //        std::cout << "   load level: " << lod_level <<
+                //        std::endl;
+                //        std::cout << "   lod info: " << std::endl;
+                for (uint64_t i = 0; i < lod_level; ++i) {
+                    uint64_t size;
                    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-                    //        std::cout << "   tensor desc size: " << size <<
-                    //        std::endl;
-                    std::unique_ptr<char[]> buf(new char[size]);
-                    is.read(reinterpret_cast<char *>(buf.get()), size);
-
-                    framework::proto::VarType::TensorDesc desc;
-                    desc.ParseFromArray(buf.get(), size);
-
-                    //        std::cout << "   desc dims size " <<
-                    //        desc.dims().size() <<
-                    //        std::endl;
-                    int memory_size = 1;
-                    for (int l = 0; l < desc.dims().size(); ++l) {
-                        //          std::cout << "    dim " << l << " value: "
-                        //          <<
-                        //          desc.dims()[l]
-                        //                    << std::endl;
-                        memory_size *= desc.dims()[l];
-                    }
-
-                    int type_size = 0;
-                    //        std::cout << "    desc pre type: ";
-                    switch (desc.data_type()) {
-                    case framework::proto::VarType::FP16:
-                        //            std::cout << "FP16" << std::endl;
-                        type_size = 2;
-                        break;
-                    case framework::proto::VarType::FP32:
-                        type_size = 4;
-                        //            std::cout << "FP32" << std::endl;
-                        break;
-                    case framework::proto::VarType::FP64:
-                        type_size = 8;
-                        //            std::cout << "FP64" << std::endl;
-                        break;
-                    case framework::proto::VarType::INT32:
-                        type_size = 4;
-                        //            std::cout << "INT32" << std::endl;
-                        break;
-                    case framework::proto::VarType::INT64:
-                        type_size = 8;
-                        //            std::cout << "INT64" << std::endl;
-                        break;
-                    case framework::proto::VarType::BOOL:
-                        type_size = 1;
-                        //            std::cout << "BOOL" << std::endl;
-                        break;
-                    default:
-                        break;
-                        //            std::cout << "    not support" <<
+                    std::vector<size_t> tmp(size / sizeof(size_t));
+                    is.read(reinterpret_cast<char *>(tmp.data()),
+                            static_cast<std::streamsize>(size));
+                    for (int j = 0; j < tmp.size(); ++j) {
+                        //            std::cout << "    lod - " << tmp[j] <<
                        //            std::endl;
                    }
+                }
+
+                uint32_t tensor_version;
+                is.read(reinterpret_cast<char *>(&version), sizeof(version));
+                //        std::cout << "   tensor_version: " <<
+                //        tensor_version <<
+                //        std::endl;
+
+                int32_t size;
+                is.read(reinterpret_cast<char *>(&size), sizeof(size));
+                //        std::cout << "   tensor desc size: " << size <<
+                //        std::endl;
+                std::unique_ptr<char[]> buf(new char[size]);
+                is.read(reinterpret_cast<char *>(buf.get()), size);
+
+                framework::proto::VarType::TensorDesc desc;
+                desc.ParseFromArray(buf.get(), size);
+
+                //        std::cout << "   desc dims size " <<
+                //        desc.dims().size() <<
+                //        std::endl;
+                int memory_size = 1;
+                for (int l = 0; l < desc.dims().size(); ++l) {
+                    //          std::cout << "    dim " << l << " value: "
+                    //          <<
+                    //          desc.dims()[l]
+                    //                    << std::endl;
+                    memory_size *= desc.dims()[l];
+                }

-                    //        std::cout << "    malloc size: " << memory_size *
-                    //        type_size
-                    //                  << std::endl;
-                    void *memory = malloc(memory_size * type_size);
-                    is.read(static_cast<char *>(memory),
-                            memory_size * type_size);
-                    //        std::cout << "    memory: " << memory <<
-                    //        std::endl;
-                    is.close();
-                } else {
-                    //        std::cout << "  *not load "
-                    //                  << " var : " << var.name() << std::endl;
+                int type_size = 0;
+                //        std::cout << "    desc pre type: ";
+                switch (desc.data_type()) {
+                case framework::proto::VarType::FP16:
+                    //            std::cout << "FP16" << std::endl;
+                    type_size = 2;
+                    break;
+                case framework::proto::VarType::FP32:
+                    type_size = 4;
+                    //            std::cout << "FP32" << std::endl;
+                    break;
+                case framework::proto::VarType::FP64:
+                    type_size = 8;
+                    //            std::cout << "FP64" << std::endl;
+                    break;
+                case framework::proto::VarType::INT32:
+                    type_size = 4;
+                    //            std::cout << "INT32" << std::endl;
+                    break;
+                case framework::proto::VarType::INT64:
+                    type_size = 8;
+                    //            std::cout << "INT64" << std::endl;
+                    break;
+                case framework::proto::VarType::BOOL:
+                    type_size = 1;
+                    //            std::cout << "BOOL" << std::endl;
+                    break;
+                default:
+                    break;
+                    //            std::cout << "    not support" <<
+                    //            std::endl;
                }
+
+                //        std::cout << "    malloc size: " << memory_size *
+                //        type_size
+                //                  << std::endl;
+                void *memory = malloc(memory_size * type_size);
+                is.read(static_cast<char *>(memory), memory_size * type_size);
+                //        std::cout << "    memory: " << memory <<
+                //        std::endl;
+                is.close();
+            } else {
+                //        std::cout << "  *not load "
+                //                  << " var : " << var.name() << std::endl;
            }
        }
+    }

 #endif
-        return program;
-    }
+    return program;
+}

-    template class Loader<CPU, Precision::FP32>;
+template class Loader<CPU, Precision::FP32>;

 } // namespace paddle_mobile
--- a/src/io.h
+++ b/src/io.h
@@ -27,14 +27,13 @@ SOFTWARE.

 namespace paddle_mobile {

-    template <typename Dtype, Precision P = Precision::FP32>
-    class Loader : PaddleMobileObject {
-      public:
-        const framework::Program<Dtype, P> Load(const std::string &dirname);
+template <typename Dtype, Precision P = Precision::FP32>
+class Loader : PaddleMobileObject {
+  public:
+    const framework::Program<Dtype, P> Load(const std::string &dirname);

-      private:
-        void LoadVar(framework::LoDTensor *tensor,
-                     const std::string &file_path);
-    };
+  private:
+    void LoadVar(framework::LoDTensor *tensor, const std::string &file_path);
+};

 } // namespace paddle_mobile
--- a/src/memory/t_malloc.cc
+++ b/src/memory/t_malloc.cc
@@ -22,30 +22,30 @@ SOFTWARE.
 #include <cstring>

 namespace paddle_mobile {
-    namespace memory {
-        const int MALLOC_ALIGN = 16;
+namespace memory {
+const int MALLOC_ALIGN = 16;

-        void Copy(void *dst, const void *src, size_t num) {
-            std::memcpy(dst, src, num);
-        };
+void Copy(void *dst, const void *src, size_t num) {
+    std::memcpy(dst, src, num);
+};

-        void *Alloc(size_t size) {
-            size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
-            char *p = static_cast<char *>(malloc(offset + size));
-            if (!p) {
-                return nullptr;
-            }
-            void *r = reinterpret_cast<void *>(
-                reinterpret_cast<size_t>(p + offset) & (~(MALLOC_ALIGN - 1)));
-            static_cast<void **>(r)[-1] = p;
-            return r;
-        }
+void *Alloc(size_t size) {
+    size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
+    char *p = static_cast<char *>(malloc(offset + size));
+    if (!p) {
+        return nullptr;
+    }
+    void *r = reinterpret_cast<void *>(reinterpret_cast<size_t>(p + offset) &
+                                       (~(MALLOC_ALIGN - 1)));
+    static_cast<void **>(r)[-1] = p;
+    return r;
+}

-        void Free(void *ptr) {
-            if (ptr) {
-                free(static_cast<void **>(ptr)[-1]);
-            }
-        }
+void Free(void *ptr) {
+    if (ptr) {
+        free(static_cast<void **>(ptr)[-1]);
+    }
+}

-    } // namespace memory
+} // namespace memory
 } // namespace paddle_mobile
--- a/src/memory/t_malloc.h
+++ b/src/memory/t_malloc.h
@@ -21,44 +21,44 @@ SOFTWARE.
 #include <type_traits>

 namespace paddle_mobile {
-    namespace memory {
+namespace memory {

-        void Copy(void *dst, const void *src, size_t num);
+void Copy(void *dst, const void *src, size_t num);

-        void *Alloc(size_t size);
+void *Alloc(size_t size);

-        void Free(void *ptr);
+void Free(void *ptr);

-        /**
-         * \brief   Free memory block in one place.
-         *
-         * \note    In some cases, custom deleter is used to
-         *          deallocate the memory automatically for
-         *          std::unique_ptr<T> in tensor.h.
-         *          static_cast
-         */
-        template <typename T> class PODDeleter {
-            static_assert(std::is_pod<T>::value, "T must be POD");
+/**
+ * \brief   Free memory block in one place.
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *          static_cast
+ */
+template <typename T> class PODDeleter {
+    static_assert(std::is_pod<T>::value, "T must be POD");

-          public:
-            explicit PODDeleter(){};
+  public:
+    explicit PODDeleter(){};

-            void operator()(T *ptr) { Free(static_cast<void *>(ptr)); }
-        };
+    void operator()(T *ptr) { Free(static_cast<void *>(ptr)); }
+};

-        /**
-         * \brief   Free memory block in one place does not meet POD
-         *
-         * \note    In some cases, custom deleter is used to
-         *          deallocate the memory automatically for
-         *          std::unique_ptr<T> in tensor.h.
-         *          reinterpret_cast
-         */
-        template <typename T> class PlainDeleter {
-          public:
-            explicit PlainDeleter(){};
+/**
+ * \brief   Free memory block in one place does not meet POD
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *          reinterpret_cast
+ */
+template <typename T> class PlainDeleter {
+  public:
+    explicit PlainDeleter(){};

-            void operator()(T *ptr) { Free(reinterpret_cast<void *>(ptr)); }
-        };
-    } // namespace memory
+    void operator()(T *ptr) { Free(reinterpret_cast<void *>(ptr)); }
+};
+} // namespace memory
 } // namespace paddle_mobile
--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -23,50 +23,50 @@ SOFTWARE.
 namespace paddle_mobile {
 namespace operators {

-int ConvOutputSize(int input_size, int filter_size, int dilation,
-                   int padding, int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  return output_size;
+int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
+                   int stride) {
+    const int dkernel = dilation * (filter_size - 1) + 1;
+    int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+    return output_size;
 }

-template<typename Dtype, typename T>
+template <typename Dtype, typename T>
 void ConvOp<Dtype, T>::InferShape() const {
-  //  std::cout << " begin get dims: " << std::endl;
+    //  std::cout << " begin get dims: " << std::endl;

-  auto in_dims = param_.Input()->dims();
+    auto in_dims = param_.Input()->dims();

-  //  std::cout << " end get in dims: " << std::endl;
+    //  std::cout << " end get in dims: " << std::endl;

-  //  std::cout << " in_dims: " << in_dims << std::endl;
+    //  std::cout << " in_dims: " << in_dims << std::endl;

-  //  std::cout << " begin get Filter " << std::endl;
+    //  std::cout << " begin get Filter " << std::endl;

-  auto filter_dims = param_.Filter()->dims();
+    auto filter_dims = param_.Filter()->dims();

-  //  std::cout << " end get Filter " << std::endl;
+    //  std::cout << " end get Filter " << std::endl;

-  //  std::cout << " begin get Attrs " << std::endl;
+    //  std::cout << " begin get Attrs " << std::endl;

-  const std::vector<int> &strides = param_.Strides();
+    const std::vector<int> &strides = param_.Strides();

-  //  std::cout << " end get Attrs " << strides[0] << std::endl;
+    //  std::cout << " end get Attrs " << strides[0] << std::endl;

-  std::vector<int> paddings = param_.Paddings();
+    std::vector<int> paddings = param_.Paddings();

-  int groups = param_.Groups();
+    int groups = param_.Groups();

-  std::vector<int> dilations = param_.Dilations();
+    std::vector<int> dilations = param_.Dilations();

-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                       dilations[i], paddings[i], strides[i]));
-  }
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+    for (size_t i = 0; i < strides.size(); ++i) {
+        output_shape.push_back(ConvOutputSize(in_dims[i + 2],
+                                              filter_dims[i + 2], dilations[i],
+                                              paddings[i], strides[i]));
+    }

-  framework::DDim ddim = framework::make_ddim(output_shape);
-  param_.Output()->Resize(ddim);
+    framework::DDim ddim = framework::make_ddim(output_shape);
+    param_.Output()->Resize(ddim);
 }

 template class ConvOp<CPU, float>;

--- a/src/operators/conv_op.h
+++ b/src/operators/conv_op.h
@@ -26,29 +26,28 @@ namespace operators {

 using namespace framework;

-template<typename DeviceType, typename T>
+template <typename DeviceType, typename T>
 class ConvOp : public framework::OperatorWithKernel<DeviceType> {
-public:
-  ConvOp(const std::string &type, const VariableNameMap &inputs,
-         const VariableNameMap &outputs,
-         const framework::AttributeMap &attrs,
-         std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<DeviceType>(
-      type, inputs, outputs, attrs, scope),
-        param_(inputs, outputs, attrs, *scope) {}
-
-  using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
-  void InferShape() const override;
-
-  void Run() const {
-    operators::ConvKernel<DeviceType, T, ConvParam> kernel;
-    kernel.Compute(param_);
-    this->ClearVariables();
-  }
-
-private:
-  ConvParam param_;
+  public:
+    ConvOp(const std::string &type, const VariableNameMap &inputs,
+           const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+           std::shared_ptr<framework::Scope> scope)
+        : framework::OperatorWithKernel<DeviceType>(type, inputs, outputs,
+                                                    attrs, scope),
+          param_(inputs, outputs, attrs, *scope) {}
+
+    using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
+    void InferShape() const override;
+
+    void Run() const {
+        operators::ConvKernel<DeviceType, T, ConvParam> kernel;
+        kernel.Compute(param_);
+        this->ClearVariables();
+    }
+
+  private:
+    ConvParam param_;
 };

-} // operators
-} // paddle_mobile
+} // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -21,11 +21,11 @@ SOFTWARE.
 namespace paddle_mobile {
 namespace operators {

-template<typename Dtype, typename T>
+template <typename Dtype, typename T>
 void ElementwiseAddOp<Dtype, T>::InferShape() const {
-  auto x_dim = param_.InputX()->dims();
-  param_.Out()->Resize(x_dim);
+    auto x_dim = param_.InputX()->dims();
+    param_.Out()->Resize(x_dim);
 }
 template class ElementwiseAddOp<CPU, float>;
-}
-}
+} // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/elementwise_add_op.h
+++ b/src/operators/elementwise_add_op.h
@@ -25,31 +25,28 @@ namespace operators {

 using namespace framework;

-template<typename DeviceType, typename T>
-class ElementwiseAddOp
-    : public framework::OperatorWithKernel<DeviceType> {
-public:
-  ElementwiseAddOp(const std::string &type,
-                   const VariableNameMap &inputs,
-                   const VariableNameMap &outputs,
-                   const framework::AttributeMap attrs,
-                   std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<DeviceType>(
-      type, inputs, outputs, attrs, scope),
-        param_(inputs, outputs, attrs, *scope) {}
+template <typename DeviceType, typename T>
+class ElementwiseAddOp : public framework::OperatorWithKernel<DeviceType> {
+  public:
+    ElementwiseAddOp(const std::string &type, const VariableNameMap &inputs,
+                     const VariableNameMap &outputs,
+                     const framework::AttributeMap attrs,
+                     std::shared_ptr<framework::Scope> scope)
+        : framework::OperatorWithKernel<DeviceType>(type, inputs, outputs,
+                                                    attrs, scope),
+          param_(inputs, outputs, attrs, *scope) {}

-  void Run() const {
-    operators::ElementwiseAddKernel<DeviceType, T,
-                                    ElementwiseAddParam>
-        kernel;
-    kernel.Compute(param_);
-  }
+    void Run() const {
+        operators::ElementwiseAddKernel<DeviceType, T, ElementwiseAddParam>
+            kernel;
+        kernel.Compute(param_);
+    }

-  using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
-  void InferShape() const override;
+    using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
+    void InferShape() const override;

-protected:
-  ElementwiseAddParam param_;
+  protected:
+    ElementwiseAddParam param_;
 };
-}
-}
+} // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -22,140 +22,131 @@ namespace paddle_mobile {
 namespace operators {

 bool IsExpand(const std::vector<int64_t> &filter_dim,
-              const std::vector<int> &strides,
-              const std::vector<int> &paddings,
+              const std::vector<int> &strides, const std::vector<int> &paddings,
              const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true,
-      dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 =
-        filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+    bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+    for (size_t j = 0; j < strides.size(); ++j) {
+        filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+        strides_1 = strides_1 && (strides[j] == 1);
+        padding_0 = padding_0 && (paddings[j] == 0);
+        dilation_1 = dilation_1 && (dilations[j] == 1);
+    }
+    return !(filter_1 && strides_1 && padding_0 && dilation_1);
 }

-template<>
-void ConvKernel<CPU, float, ConvParam>::Compute(
-    const ConvParam &param) const {
-  LOG(kLOG_DEBUG) << param;
-
-  const Tensor *input = param.Input();
-
-  // The filter will be reshaped in the calculations,
-  // so here use an assignment operation,
-  // that avoids modifying the variable in the Scope.
-  Tensor filter = *param.Filter();
-
-  Tensor *output = param.Output();
-  //            output->mutable_data<T>(context.GetPlace());
-
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  DLOG << " compute end get Attrs " << strides[0];
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h,
-  // k_w}
-  std::vector<int64_t> filter_shape_vec(
-      framework::vectorize(filter.dims()));
-  // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h,
-  // o_w}
-  std::vector<int64_t> output_shape_vec(
-      framework::vectorize(output->dims()));
-
-  // use col_shape in the im2col calculation
-  // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h,
-  // k_w, o_d,
-  // o_h, o_w}
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  // use col_matrix_shape in the gemm calculation
-  // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w,
-  // o_d *
-  // o_h * o_w)
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  // col_matrix shares the same piece of data with col,
-  // but will be reshaped into a two-dimensional matrix shape
-  // to call the matrix multiplication interface.
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {
-      filter.dims()[0], filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  //            auto& dev_ctx = context.template
-  //            device_context<DeviceContext>();
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch =
-        output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice =
-          in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
+template <>
+void ConvKernel<CPU, float, ConvParam>::Compute(const ConvParam &param) const {
+    LOG(kLOG_DEBUG) << param;
+
+    const Tensor *input = param.Input();
+
+    // The filter will be reshaped in the calculations,
+    // so here use an assignment operation,
+    // that avoids modifying the variable in the Scope.
+    Tensor filter = *param.Filter();
+
+    Tensor *output = param.Output();
+    //            output->mutable_data<T>(context.GetPlace());
+
+    int groups = param.Groups();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    std::vector<int> dilations = param.Dilations();
+
+    DLOG << " compute end get Attrs " << strides[0];
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h,
+    // k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h,
+    // o_w}
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h,
+    // k_w, o_d,
+    // o_h, o_w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = input->dims()[1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+        col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+        col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+    }
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w,
+    // o_d *
+    // o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    if (is_expand) {
+        col.mutable_data<float>(col_shape);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1],
-                                paddings[0], paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      Tensor out_slice =
-          out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice =
-          filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          float(1.0), &out_slice, float(0.0));
    }
-  }
+
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
+    filter.Resize(filter_matrix_shape);
+
+    framework::DDim output_matrix_shape = {
+        output->dims()[1],
+        output->numel() / (output->dims()[0] * output->dims()[1])};
+
+    // convolution operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+    math::Vol2ColFunctor<CPU, float> vol2col;
+    math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+
+    //            auto& dev_ctx = context.template
+    //            device_context<DeviceContext>();
+    for (int i = 0; i < batch_size; i++) {
+        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+        Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+        for (int g = 0; g < groups; g++) {
+            Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+            if (!is_expand) {
+                col.ShareDataWith(in_slice);
+                col_matrix.ShareDataWith(col);
+                col_matrix.Resize(col_matrix_shape);
+            } else if (data_dim == 2U) {
+                // im2col
+                im2col(in_slice, dilations, strides,
+                       std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                        paddings[1]},
+                       &col);
+            } else if (data_dim == 3U) {
+                // vol2col
+                vol2col(in_slice, dilations, strides, paddings, &col);
+            }
+
+            // gemm
+            Tensor out_slice =
+                out_batch.Slice(g * out_step, (g + 1) * out_step);
+            Tensor filter_slice =
+                filter.Slice(g * out_step, (g + 1) * out_step);
+            math::matmul<float>(filter_slice, false, col_matrix, false,
+                                float(1.0), &out_slice, float(0.0));
+        }
+    }
 }

 template class ConvKernel<CPU, float, ConvParam>;

--- a/src/operators/kernel/arm/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp
@@ -19,23 +19,23 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {

-template<typename T> struct AddFunctor {
-  inline T operator()(T a, T b) const { return a + b; }
+template <typename T> struct AddFunctor {
+    inline T operator()(T a, T b) const { return a + b; }
 };

-template<>
+template <>
 void ElementwiseAddKernel<CPU, float, ElementwiseAddParam>::Compute(
    const ElementwiseAddParam &param) const {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  Tensor *Out = param.Out();
-  Out->mutable_data<float>();
-  const int axis = param.Axis();
-  ElementwiseComputeEx<AddFunctor<float>, float>(
-      input_x, input_y, axis, AddFunctor<float>(), Out);
+    const Tensor *input_x = param.InputX();
+    const Tensor *input_y = param.InputY();
+    Tensor *Out = param.Out();
+    Out->mutable_data<float>();
+    const int axis = param.Axis();
+    ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
+                                                   AddFunctor<float>(), Out);
 }

 template class ElementwiseAddKernel<CPU, float, ElementwiseAddParam>;

 } // namespace operators
-} // namespace paddle
+} // namespace paddle_mobile
--- a/src/operators/kernel/arm/mul_kernel.cpp
+++ b/src/operators/kernel/arm/mul_kernel.cpp
@@ -23,34 +23,32 @@ SOFTWARE.
 namespace paddle_mobile {
 namespace operators {

-template<>
-void
-MulKernel<CPU, float, MulParam>::Compute(const MulParam &param) const {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  Tensor *out = param.Out();
-  out->mutable_data<float>();
-  const Tensor x_matrix =
-      input_x->dims().size() > 2
-      ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
-      : *input_x;
-  const Tensor y_matrix =
-      input_y->dims().size() > 2
-      ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
-      : *input_y;
-  auto out_dim = out->dims();
-  if (out_dim.size() != 2) {
-    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-  }
-  math::matmul<float>(x_matrix, false, y_matrix, false,
-                      static_cast<float>(1), out,
-                      static_cast<float>(0));
-  if (out_dim.size() != 2) {
-    out->Resize(out_dim);
-  }
+template <>
+void MulKernel<CPU, float, MulParam>::Compute(const MulParam &param) const {
+    const Tensor *input_x = param.InputX();
+    const Tensor *input_y = param.InputY();
+    Tensor *out = param.Out();
+    out->mutable_data<float>();
+    const Tensor x_matrix =
+        input_x->dims().size() > 2
+            ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+            : *input_x;
+    const Tensor y_matrix =
+        input_y->dims().size() > 2
+            ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+            : *input_y;
+    auto out_dim = out->dims();
+    if (out_dim.size() != 2) {
+        out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+    }
+    math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                        out, static_cast<float>(0));
+    if (out_dim.size() != 2) {
+        out->Resize(out_dim);
+    }
 }

 template class MulKernel<CPU, float, MulParam>;

 } // namespace operators
-} // namespace paddle
+} // namespace paddle_mobile
--- a/src/operators/kernel/conv_kernel.h
+++ b/src/operators/kernel/conv_kernel.h
@@ -29,11 +29,10 @@ namespace operators {

 using namespace framework;

-template<typename DeviceType, typename T, typename P>
-class ConvKernel
-    : public framework::OpKernelBase<DeviceType, ConvParam> {
-public:
-  void Compute(const ConvParam &param) const;
+template <typename DeviceType, typename T, typename P>
+class ConvKernel : public framework::OpKernelBase<DeviceType, ConvParam> {
+  public:
+    void Compute(const ConvParam &param) const;
 };
-}
-}
+} // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/kernel/elementwise_add_kernel.h
+++ b/src/operators/kernel/elementwise_add_kernel.h
@@ -26,11 +26,11 @@ namespace operators {

 using namespace framework;

-template<typename DeviceType, typename T, typename P>
+template <typename DeviceType, typename T, typename P>
 class ElementwiseAddKernel
    : public framework::OpKernelBase<DeviceType, ElementwiseAddParam> {
-public:
-  void Compute(const ElementwiseAddParam &param) const;
+  public:
+    void Compute(const ElementwiseAddParam &param) const;
 };
-}
-}
+} // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/kernel/fpga/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_kernel.cpp
@@ -25,4 +25,4 @@ namespace operators {
 //
 // template class ConvKernel<FPGA, float>;
 }
-}
+} // namespace paddle_mobile
--- a/src/operators/kernel/mul_kernel.h
+++ b/src/operators/kernel/mul_kernel.h
@@ -26,10 +26,10 @@ namespace operators {

 using namespace framework;

-template<typename DeviceType, typename T, typename P>
+template <typename DeviceType, typename T, typename P>
 class MulKernel : public framework::OpKernelBase<DeviceType, MulParam> {
-public:
-  void Compute(const MulParam &param) const;
+  public:
+    void Compute(const MulParam &param) const;
 };
-}
-}
+} // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/math/elementwise_op_function.h
+++ b/src/operators/math/elementwise_op_function.h
@@ -34,178 +34,174 @@ namespace operators {
 inline void get_mid_dims(const framework::DDim &x_dims,
                         const framework::DDim &y_dims, const int axis,
                         int *pre, int *n, int *post) {
-  *pre = 1;
-  *n = 1;
-  *post = 1;
-  // compute pre
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= x_dims[i];
-  }
-
-  for (int i = 0; i < y_dims.size(); ++i) {
-    assert(x_dims[i + axis] == y_dims[i]);
-    /// "Broadcast dimension mismatch.");
-    (*n) *= y_dims[i];
-  }
-
-  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    (*post) *= x_dims[i];
-  }
+    *pre = 1;
+    *n = 1;
+    *post = 1;
+    // compute pre
+    for (int i = 0; i < axis; ++i) {
+        (*pre) *= x_dims[i];
+    }
+
+    for (int i = 0; i < y_dims.size(); ++i) {
+        assert(x_dims[i + axis] == y_dims[i]);
+        /// "Broadcast dimension mismatch.");
+        (*n) *= y_dims[i];
+    }
+
+    for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+        (*post) *= x_dims[i];
+    }
 }

 /// remove dims tail 1. (4,20,1,1) -> (4,20)
 inline void trim_trailing_singular_dims(framework::DDim *dims) {
-  // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims->size();
-  for (; actual_dims_size != 0; --actual_dims_size) {
-    if ((*dims)[actual_dims_size - 1] != 1)
-      break;
-  }
-  if (actual_dims_size != dims->size()) {
-    auto actual_dims = framework::vectorize(*dims);
-    actual_dims.resize(actual_dims_size);
-    *dims = framework::make_ddim(actual_dims);
-  }
+    // Remove trailing dimensions of size 1 for y
+    auto actual_dims_size = dims->size();
+    for (; actual_dims_size != 0; --actual_dims_size) {
+        if ((*dims)[actual_dims_size - 1] != 1)
+            break;
+    }
+    if (actual_dims_size != dims->size()) {
+        auto actual_dims = framework::vectorize(*dims);
+        actual_dims.resize(actual_dims_size);
+        *dims = framework::make_ddim(actual_dims);
+    }
 }

-template<typename T> class RowwiseTransformIterator {
-public:
-  RowwiseTransformIterator(const T *ptr, int n)
-      : ptr_(ptr), i_(0), n_(n) {}
+template <typename T> class RowwiseTransformIterator {
+  public:
+    RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}

-  RowwiseTransformIterator<T> &operator++() {
-    ++i_;
-    if (UNLIKELY(i_ == n_)) {
-      i_ = 0;
+    RowwiseTransformIterator<T> &operator++() {
+        ++i_;
+        if (UNLIKELY(i_ == n_)) {
+            i_ = 0;
+        }
+        return *this;
    }
-    return *this;
-  }

-  bool operator==(const RowwiseTransformIterator<T> &rhs) const {
-    return (ptr_ + i_) == &(*rhs);
-  }
+    bool operator==(const RowwiseTransformIterator<T> &rhs) const {
+        return (ptr_ + i_) == &(*rhs);
+    }

-  bool operator!=(const RowwiseTransformIterator<T> &rhs) const {
-    return (ptr_ + i_) != &(*rhs);
-  }
+    bool operator!=(const RowwiseTransformIterator<T> &rhs) const {
+        return (ptr_ + i_) != &(*rhs);
+    }

-  const T &operator*() { return ptr_[i_]; }
+    const T &operator*() { return ptr_[i_]; }

-private:
-  const T *ptr_;
-  int i_;
-  int64_t n_;
+  private:
+    const T *ptr_;
+    int i_;
+    int64_t n_;
 };

 /// (4,20,2)+(20,): (20,) just as (20,1), when move 2 strides in last
 /// dimension
 /// in (4,20,2) is 2 ,
 /// (20,1) move 1 stride , to fill(add) 2 element with the same number.
-template<typename T> class MidWiseTransformIterator {
-public:
-  MidWiseTransformIterator(const T *ptr, int n, int post)
-      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
-
-  MidWiseTransformIterator<T> &operator++() {
-    ++j_;
-    if (UNLIKELY(j_ == post_)) {
-      ++i_;
-      j_ = 0;
-      if (UNLIKELY(i_ == n_)) {
-        i_ = 0;
-      }
-    }
-    return *this;
-  }
-
-  bool operator==(const MidWiseTransformIterator<T> &rhs) const {
-    return (ptr_ + i_) == &(*rhs);
-  }
-
-  bool operator!=(const MidWiseTransformIterator<T> &rhs) const {
-    return (ptr_ + i_) != &(*rhs);
-  }
-
-  const T &operator*() { return ptr_[i_]; }
-
-private:
-  const T *ptr_;
-  int64_t i_;
-  int64_t j_;
-  int64_t n_;
-  int64_t post_;
+template <typename T> class MidWiseTransformIterator {
+  public:
+    MidWiseTransformIterator(const T *ptr, int n, int post)
+        : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
+
+    MidWiseTransformIterator<T> &operator++() {
+        ++j_;
+        if (UNLIKELY(j_ == post_)) {
+            ++i_;
+            j_ = 0;
+            if (UNLIKELY(i_ == n_)) {
+                i_ = 0;
+            }
+        }
+        return *this;
+    }
+
+    bool operator==(const MidWiseTransformIterator<T> &rhs) const {
+        return (ptr_ + i_) == &(*rhs);
+    }
+
+    bool operator!=(const MidWiseTransformIterator<T> &rhs) const {
+        return (ptr_ + i_) != &(*rhs);
+    }
+
+    const T &operator*() { return ptr_[i_]; }
+
+  private:
+    const T *ptr_;
+    int64_t i_;
+    int64_t j_;
+    int64_t n_;
+    int64_t post_;
 };

-template<typename Functor, typename T, typename OutType = T>
+template <typename Functor, typename T, typename OutType = T>
 class TransformFunctor {
-public:
-  TransformFunctor(const framework::Tensor *x,
-                   const framework::Tensor *y, framework::Tensor *z,
-                   Functor func)
-      : x_(x->data<T>()), y_(y->data<T>()),
-        z_(z->mutable_data<OutType>()), nx_(x->numel()), func_(func) {
-  }
-
-  inline void Run() const {
-    math::Transform trans;
-    // 同时执行func(x_, y_)传入z_。
-    trans(x_, x_ + nx_, y_, z_, func_);
-  }
-
-  inline void RunRowWise(int n, int pre) const {
-    math::Transform trans;
-    trans(x_, x_ + nx_, RowwiseTransformIterator<T>(y_, n), z_,
-          func_);
-  }
-
-  inline void RunMidWise(int n, int pre, int post) const {
-    math::Transform trans;
-    trans(x_, x_ + nx_, MidWiseTransformIterator<T>(y_, n, post),
-          z_, func_);
-  }
-
-private:
-  const T *x_;
-  const T *y_;
-  OutType *z_;
-  int64_t nx_;
-  Functor func_;
+  public:
+    TransformFunctor(const framework::Tensor *x, const framework::Tensor *y,
+                     framework::Tensor *z, Functor func)
+        : x_(x->data<T>()), y_(y->data<T>()), z_(z->mutable_data<OutType>()),
+          nx_(x->numel()), func_(func) {}
+
+    inline void Run() const {
+        math::Transform trans;
+        // 同时执行func(x_, y_)传入z_。
+        trans(x_, x_ + nx_, y_, z_, func_);
+    }
+
+    inline void RunRowWise(int n, int pre) const {
+        math::Transform trans;
+        trans(x_, x_ + nx_, RowwiseTransformIterator<T>(y_, n), z_, func_);
+    }
+
+    inline void RunMidWise(int n, int pre, int post) const {
+        math::Transform trans;
+        trans(x_, x_ + nx_, MidWiseTransformIterator<T>(y_, n, post), z_,
+              func_);
+    }
+
+  private:
+    const T *x_;
+    const T *y_;
+    OutType *z_;
+    int64_t nx_;
+    Functor func_;
 };

-template<typename Functor, typename T, typename OutType = T>
+template <typename Functor, typename T, typename OutType = T>
 void ElementwiseComputeEx(const framework::Tensor *x,
-                          const framework::Tensor *y, int axis,
-                          Functor func, framework::Tensor *z) {
-  TransformFunctor<Functor, T, OutType> functor(x, y, z, func);
-
-  auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  // PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-  //                  "Rank of first input must >= rank of second
-  //                  input.");
-
-  if (x_dims == y_dims) {
-    functor.Run();
-    return;
-  }
-
-  /// axis = -1 represent the last dimension.
-  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  // PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-  //               "Axis should be in range [0, x_dims)");
-  trim_trailing_singular_dims(&y_dims);
-  axis = (y_dims.size() == 0) ? x_dims.size() : axis;
-
-  int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
-  if (post == 1) {
-    functor.RunRowWise(n, pre);
-    return;
-  } else {
-    functor.RunMidWise(n, pre, post);
-    return;
-  }
+                          const framework::Tensor *y, int axis, Functor func,
+                          framework::Tensor *z) {
+    TransformFunctor<Functor, T, OutType> functor(x, y, z, func);
+
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+    // PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+    //                  "Rank of first input must >= rank of second
+    //                  input.");
+
+    if (x_dims == y_dims) {
+        functor.Run();
+        return;
+    }
+
+    /// axis = -1 represent the last dimension.
+    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+    // PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+    //               "Axis should be in range [0, x_dims)");
+    trim_trailing_singular_dims(&y_dims);
+    axis = (y_dims.size() == 0) ? x_dims.size() : axis;
+
+    int pre, n, post;
+    get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+    if (post == 1) {
+        functor.RunRowWise(n, pre);
+        return;
+    } else {
+        functor.RunMidWise(n, pre, post);
+        return;
+    }
 }

 } // namespace operators
-} // namespace paddle
+} // namespace paddle_mobile
--- a/src/operators/math/im2col.cc
+++ b/src/operators/math/im2col.cc
@@ -25,76 +25,71 @@ namespace math {
 *   [input_channels, filter_height, filter_width, output_height,
 * output_width]
 */
-template<class T> class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
-public:
-  void operator()(const framework::Tensor &im,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding,
-                  framework::Tensor *col) {
-    //    PADDLE_ENFORCE(im.dims().size() == 3);
-    //    PADDLE_ENFORCE(col->dims().size() == 5);
+template <class T> class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
+  public:
+    void operator()(const framework::Tensor &im,
+                    const std::vector<int> &dilation,
+                    const std::vector<int> &stride,
+                    const std::vector<int> &padding, framework::Tensor *col) {
+        //    PADDLE_ENFORCE(im.dims().size() == 3);
+        //    PADDLE_ENFORCE(col->dims().size() == 5);

-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int col_height = col->dims()[3];
-    int col_width = col->dims()[4];
+        int im_channels = im.dims()[0];
+        int im_height = im.dims()[1];
+        int im_width = im.dims()[2];
+        int filter_height = col->dims()[1];
+        int filter_width = col->dims()[2];
+        int col_height = col->dims()[3];
+        int col_width = col->dims()[4];

-    //    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
-    //    -
-    //                       ((dilation[0] * (filter_height - 1)
-    //                       + 1))) /
-    //                              stride[0] +
-    //                          1,
-    //                      col_height,
-    //                      "Output_height and
-    //                      padding(padding_up, padding_down)
-    //                      are " "inconsistent.");
-    //    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
-    //    -
-    //                       ((dilation[1] * (filter_width - 1)
-    //                       + 1))) /
-    //                              stride[1] +
-    //                          1,
-    //                      col_width,
-    //                      "Output_height and
-    //                      padding(padding_up, padding_down)
-    //                      are " "inconsistent.");
+        //    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
+        //    -
+        //                       ((dilation[0] * (filter_height - 1)
+        //                       + 1))) /
+        //                              stride[0] +
+        //                          1,
+        //                      col_height,
+        //                      "Output_height and
+        //                      padding(padding_up, padding_down)
+        //                      are " "inconsistent.");
+        //    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
+        //    -
+        //                       ((dilation[1] * (filter_width - 1)
+        //                       + 1))) /
+        //                              stride[1] +
+        //                          1,
+        //                      col_width,
+        //                      "Output_height and
+        //                      padding(padding_up, padding_down)
+        //                      are " "inconsistent.");

-    int channels_col =
-        im_channels * filter_height * filter_width;
+        int channels_col = im_channels * filter_height * filter_width;

-    const T *im_data = im.data<T>();
-    T *col_data = col->data<T>();
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < col_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] +
-            h_offset * dilation[0];
-        for (int w = 0; w < col_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] +
-              w_offset * dilation[1];
-          int col_idx =
-              (c * col_height + h) * col_width + w;
-          int im_idx =
-              (im_row_idx + c_im * im_height) * im_width +
-                  im_col_idx;
+        const T *im_data = im.data<T>();
+        T *col_data = col->data<T>();
+        for (int c = 0; c < channels_col; ++c) {
+            int w_offset = c % filter_width;
+            int h_offset = (c / filter_width) % filter_height;
+            int c_im = c / (filter_width * filter_height);
+            for (int h = 0; h < col_height; ++h) {
+                int im_row_idx =
+                    h * stride[0] - padding[0] + h_offset * dilation[0];
+                for (int w = 0; w < col_width; ++w) {
+                    int im_col_idx =
+                        w * stride[1] - padding[1] + w_offset * dilation[1];
+                    int col_idx = (c * col_height + h) * col_width + w;
+                    int im_idx =
+                        (im_row_idx + c_im * im_height) * im_width + im_col_idx;

-          col_data[col_idx] =
-              (im_row_idx < 0 ||
-                  im_row_idx >= im_height ||
-                  im_col_idx < 0 || im_col_idx >= im_width)
-              ? static_cast<T>(0)
-              : im_data[im_idx];
+                    col_data[col_idx] =
+                        (im_row_idx < 0 || im_row_idx >= im_height ||
+                         im_col_idx < 0 || im_col_idx >= im_width)
+                            ? static_cast<T>(0)
+                            : im_data[im_idx];
+                }
+            }
        }
-      }
    }
-  }
 };

 /*
@@ -103,75 +98,68 @@ public:
 *   [input_channels, filter_height, filter_width, output_height,
 * output_width]
 */
-template<class T> class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
-public:
-  void operator()(const framework::Tensor &col,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding,
-                  framework::Tensor *im) {
-    //    PADDLE_ENFORCE(im->dims().size() == 3);
-    //    PADDLE_ENFORCE(col.dims().size() == 5);
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[1];
-    int filter_width = col.dims()[2];
-    int col_height = col.dims()[3];
-    int col_width = col.dims()[4];
+template <class T> class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
+  public:
+    void operator()(const framework::Tensor &col,
+                    const std::vector<int> &dilation,
+                    const std::vector<int> &stride,
+                    const std::vector<int> &padding, framework::Tensor *im) {
+        //    PADDLE_ENFORCE(im->dims().size() == 3);
+        //    PADDLE_ENFORCE(col.dims().size() == 5);
+        int im_channels = im->dims()[0];
+        int im_height = im->dims()[1];
+        int im_width = im->dims()[2];
+        int filter_height = col.dims()[1];
+        int filter_width = col.dims()[2];
+        int col_height = col.dims()[3];
+        int col_width = col.dims()[4];

-    //    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
-    //    -
-    //                       ((dilation[0] * (filter_height - 1)
-    //                       + 1))) /
-    //                              stride[0] +
-    //                          1,
-    //                      col_height,
-    //                      "Output_height and
-    //                      padding(padding_up, padding_down)
-    //                      are " "inconsistent.");
-    //    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
-    //    -
-    //                       ((dilation[1] * (filter_width - 1)
-    //                       + 1))) /
-    //                              stride[1] +
-    //                          1,
-    //                      col_width,
-    //                      "Output_height and
-    //                      padding(padding_up, padding_down)
-    //                      are " "inconsistent.");
+        //    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
+        //    -
+        //                       ((dilation[0] * (filter_height - 1)
+        //                       + 1))) /
+        //                              stride[0] +
+        //                          1,
+        //                      col_height,
+        //                      "Output_height and
+        //                      padding(padding_up, padding_down)
+        //                      are " "inconsistent.");
+        //    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
+        //    -
+        //                       ((dilation[1] * (filter_width - 1)
+        //                       + 1))) /
+        //                              stride[1] +
+        //                          1,
+        //                      col_width,
+        //                      "Output_height and
+        //                      padding(padding_up, padding_down)
+        //                      are " "inconsistent.");

-    int channels_col =
-        im_channels * filter_height * filter_width;
+        int channels_col = im_channels * filter_height * filter_width;

-    T *im_data = im->data<T>();
-    const T *col_data = col.data<T>();
+        T *im_data = im->data<T>();
+        const T *col_data = col.data<T>();

-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < col_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] +
-            h_offset * dilation[0];
-        for (int w = 0; w < col_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] +
-              w_offset * dilation[1];
-          if ((im_row_idx) >= 0 &&
-              (im_row_idx) < im_height &&
-              (im_col_idx) >= 0 &&
-              (im_col_idx) < im_width) {
-            im_data[(im_row_idx + c_im * im_height) *
-                im_width +
-                im_col_idx] +=
-                col_data[(c * col_height + h) *
-                    col_width +
-                    w];
-          }
+        for (int c = 0; c < channels_col; ++c) {
+            int w_offset = c % filter_width;
+            int h_offset = (c / filter_width) % filter_height;
+            int c_im = c / (filter_width * filter_height);
+            for (int h = 0; h < col_height; ++h) {
+                int im_row_idx =
+                    h * stride[0] - padding[0] + h_offset * dilation[0];
+                for (int w = 0; w < col_width; ++w) {
+                    int im_col_idx =
+                        w * stride[1] - padding[1] + w_offset * dilation[1];
+                    if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
+                        (im_col_idx) >= 0 && (im_col_idx) < im_width) {
+                        im_data[(im_row_idx + c_im * im_height) * im_width +
+                                im_col_idx] +=
+                            col_data[(c * col_height + h) * col_width + w];
+                    }
+                }
+            }
        }
-      }
    }
-  }
 };

 template class Im2ColFunctor<ColFormat::kCFO, CPU, float>;
@@ -185,85 +173,75 @@ template class Col2ImFunctor<ColFormat::kCFO, CPU, double>;
 *   [output_height, output_width, input_channels, filter_height,
 * filter_width]
 */
-template<class T> class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
-public:
-  void operator()(const framework::Tensor &im,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding,
-                  framework::Tensor *col) {
-    //    PADDLE_ENFORCE(im.dims().size() == 3);
-    //    PADDLE_ENFORCE(col->dims().size() == 5);
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[3];
-    int filter_width = col->dims()[4];
-    int col_height = col->dims()[0];
-    int col_width = col->dims()[1];
+template <class T> class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
+  public:
+    void operator()(const framework::Tensor &im,
+                    const std::vector<int> &dilation,
+                    const std::vector<int> &stride,
+                    const std::vector<int> &padding, framework::Tensor *col) {
+        //    PADDLE_ENFORCE(im.dims().size() == 3);
+        //    PADDLE_ENFORCE(col->dims().size() == 5);
+        int im_channels = im.dims()[0];
+        int im_height = im.dims()[1];
+        int im_width = im.dims()[2];
+        int filter_height = col->dims()[3];
+        int filter_width = col->dims()[4];
+        int col_height = col->dims()[0];
+        int col_width = col->dims()[1];

-    //    PADDLE_ENFORCE_EQ(
-    //        (im_height + padding[0] + padding[2] -
-    //        filter_height) / stride[0]
-    //        + 1, col_height, "Output_height and
-    //        padding(padding_up,
-    //        padding_down) are " "inconsistent.");
-    //    PADDLE_ENFORCE_EQ(
-    //        (im_width + padding[1] + padding[3] -
-    //        filter_width) / stride[1] +
-    //        1, col_width, "col_width and padding(padding_left,
-    //        padding_right)
-    //        are " "inconsistent.");
+        //    PADDLE_ENFORCE_EQ(
+        //        (im_height + padding[0] + padding[2] -
+        //        filter_height) / stride[0]
+        //        + 1, col_height, "Output_height and
+        //        padding(padding_up,
+        //        padding_down) are " "inconsistent.");
+        //    PADDLE_ENFORCE_EQ(
+        //        (im_width + padding[1] + padding[3] -
+        //        filter_width) / stride[1] +
+        //        1, col_width, "col_width and padding(padding_left,
+        //        padding_right)
+        //        are " "inconsistent.");

-    const T *im_data = im.data<T>();
-    T *col_data = col->data<T>();
+        const T *im_data = im.data<T>();
+        T *col_data = col->data<T>();

-    for (int col_row_idx = 0; col_row_idx < col_height;
-         ++col_row_idx) {
-      for (int col_col_idx = 0; col_col_idx < col_width;
-           ++col_col_idx) {
-        for (int channel = 0; channel < im_channels;
-             ++channel) {
-          for (int filter_row_idx = 0;
-               filter_row_idx < filter_height;
-               ++filter_row_idx) {
-            int im_row_offset =
-                col_row_idx * stride[0] +
-                    filter_row_idx - padding[0];
-            for (int filter_col_idx = 0;
-                 filter_col_idx < filter_width;
-                 ++filter_col_idx) {
-              int im_col_offset =
-                  col_col_idx * stride[1] +
-                      filter_col_idx - padding[1];
+        for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
+            for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
+                for (int channel = 0; channel < im_channels; ++channel) {
+                    for (int filter_row_idx = 0; filter_row_idx < filter_height;
+                         ++filter_row_idx) {
+                        int im_row_offset = col_row_idx * stride[0] +
+                                            filter_row_idx - padding[0];
+                        for (int filter_col_idx = 0;
+                             filter_col_idx < filter_width; ++filter_col_idx) {
+                            int im_col_offset = col_col_idx * stride[1] +
+                                                filter_col_idx - padding[1];

-              int col_offset =
-                  ((((col_row_idx) * col_width +
-                      col_col_idx) *
-                      im_channels +
-                      channel) *
-                      filter_height +
-                      filter_row_idx) *
-                      filter_width +
-                      filter_col_idx;
+                            int col_offset =
+                                ((((col_row_idx)*col_width + col_col_idx) *
+                                      im_channels +
+                                  channel) *
+                                     filter_height +
+                                 filter_row_idx) *
+                                    filter_width +
+                                filter_col_idx;

-              int im_offset = (channel * im_height +
-                  im_row_offset) *
-                  im_width +
-                  im_col_offset;
-              col_data[col_offset] =
-                  (im_row_offset < 0 ||
-                      im_row_offset >= im_height ||
-                      im_col_offset < 0 ||
-                      im_col_offset >= im_width)
-                  ? static_cast<T>(0)
-                  : im_data[im_offset];
+                            int im_offset =
+                                (channel * im_height + im_row_offset) *
+                                    im_width +
+                                im_col_offset;
+                            col_data[col_offset] =
+                                (im_row_offset < 0 ||
+                                 im_row_offset >= im_height ||
+                                 im_col_offset < 0 || im_col_offset >= im_width)
+                                    ? static_cast<T>(0)
+                                    : im_data[im_offset];
+                        }
+                    }
+                }
            }
-          }
        }
-      }
    }
-  }
 };

 /*
@@ -272,86 +250,75 @@ public:
 *   [output_height, output_width, input_channels, filter_height,
 * filter_width]
 */
-template<class T> class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
-public:
-  void operator()(const framework::Tensor &col,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding,
-                  framework::Tensor *im) {
-    //    PADDLE_ENFORCE(im->dims().size() == 3);
-    //    PADDLE_ENFORCE(col.dims().size() == 5);
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[3];
-    int filter_width = col.dims()[4];
-    int col_height = col.dims()[0];
-    int col_width = col.dims()[1];
+template <class T> class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
+  public:
+    void operator()(const framework::Tensor &col,
+                    const std::vector<int> &dilation,
+                    const std::vector<int> &stride,
+                    const std::vector<int> &padding, framework::Tensor *im) {
+        //    PADDLE_ENFORCE(im->dims().size() == 3);
+        //    PADDLE_ENFORCE(col.dims().size() == 5);
+        int im_channels = im->dims()[0];
+        int im_height = im->dims()[1];
+        int im_width = im->dims()[2];
+        int filter_height = col.dims()[3];
+        int filter_width = col.dims()[4];
+        int col_height = col.dims()[0];
+        int col_width = col.dims()[1];

-    //    PADDLE_ENFORCE_EQ(
-    //        (im_height + padding[0] + padding[2] -
-    //        filter_height) / stride[0]
-    //        + 1, col_height, "Output_height and
-    //        padding(padding_up,
-    //        padding_down) are " "inconsistent.");
-    //    PADDLE_ENFORCE_EQ(
-    //        (im_width + padding[1] + padding[3] -
-    //        filter_width) / stride[1] +
-    //        1, col_width, "col_width and padding(padding_left,
-    //        padding_right)
-    //        are " "inconsistent.");
+        //    PADDLE_ENFORCE_EQ(
+        //        (im_height + padding[0] + padding[2] -
+        //        filter_height) / stride[0]
+        //        + 1, col_height, "Output_height and
+        //        padding(padding_up,
+        //        padding_down) are " "inconsistent.");
+        //    PADDLE_ENFORCE_EQ(
+        //        (im_width + padding[1] + padding[3] -
+        //        filter_width) / stride[1] +
+        //        1, col_width, "col_width and padding(padding_left,
+        //        padding_right)
+        //        are " "inconsistent.");

-    T *im_data = im->data<T>();
-    const T *col_data = col.data<T>();
+        T *im_data = im->data<T>();
+        const T *col_data = col.data<T>();

-    for (int col_row_idx = 0; col_row_idx < col_height;
-         ++col_row_idx) {
-      for (int col_col_idx = 0; col_col_idx < col_width;
-           ++col_col_idx) {
-        for (int channel = 0; channel < im_channels;
-             ++channel) {
-          for (int filter_row_idx = 0;
-               filter_row_idx < filter_height;
-               ++filter_row_idx) {
-            int im_row_offset =
-                col_row_idx * stride[0] +
-                    filter_row_idx - padding[0];
-            for (int filter_col_idx = 0;
-                 filter_col_idx < filter_width;
-                 ++filter_col_idx) {
-              int im_col_offset =
-                  col_col_idx * stride[1] +
-                      filter_col_idx - padding[1];
+        for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
+            for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
+                for (int channel = 0; channel < im_channels; ++channel) {
+                    for (int filter_row_idx = 0; filter_row_idx < filter_height;
+                         ++filter_row_idx) {
+                        int im_row_offset = col_row_idx * stride[0] +
+                                            filter_row_idx - padding[0];
+                        for (int filter_col_idx = 0;
+                             filter_col_idx < filter_width; ++filter_col_idx) {
+                            int im_col_offset = col_col_idx * stride[1] +
+                                                filter_col_idx - padding[1];

-              int col_offset =
-                  (((col_row_idx * col_width +
-                      col_col_idx) *
-                      im_channels +
-                      channel) *
-                      filter_height +
-                      filter_row_idx) *
-                      filter_width +
-                      filter_col_idx;
+                            int col_offset =
+                                (((col_row_idx * col_width + col_col_idx) *
+                                      im_channels +
+                                  channel) *
+                                     filter_height +
+                                 filter_row_idx) *
+                                    filter_width +
+                                filter_col_idx;

-              if (im_row_offset >= 0 &&
-                  im_row_offset < im_height &&
-                  im_col_offset >= 0 &&
-                  im_col_offset < im_width) {
-                int im_offset =
-                    (channel * im_height +
-                        im_row_offset) *
-                        im_width +
-                        im_col_offset;
-                im_data[im_offset] +=
-                    col_data[col_offset];
-              }
+                            if (im_row_offset >= 0 &&
+                                im_row_offset < im_height &&
+                                im_col_offset >= 0 &&
+                                im_col_offset < im_width) {
+                                int im_offset =
+                                    (channel * im_height + im_row_offset) *
+                                        im_width +
+                                    im_col_offset;
+                                im_data[im_offset] += col_data[col_offset];
+                            }
+                        }
+                    }
+                }
            }
-          }
        }
-      }
    }
-  }
 };

 template class Im2ColFunctor<ColFormat::kOCF, CPU, float>;
@@ -360,5 +327,5 @@ template class Col2ImFunctor<ColFormat::kOCF, CPU, float>;
 template class Col2ImFunctor<ColFormat::kOCF, CPU, double>;

 } // namespace math
-}     // namespace operators
+} // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/im2col.h
+++ b/src/operators/math/im2col.h
@@ -87,26 +87,24 @@ enum class ColFormat { kCFO = 0, kOCF = 1 };
 * equal to
 *       colShape.inputChannels.
 */
-template<ColFormat Format, typename DeviceType, typename T>
+template <ColFormat Format, typename DeviceType, typename T>
 class Im2ColFunctor {
-public:
-  void operator()(const framework::Tensor &im,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding,
-                  framework::Tensor *col);
+  public:
+    void operator()(const framework::Tensor &im,
+                    const std::vector<int> &dilation,
+                    const std::vector<int> &stride,
+                    const std::vector<int> &padding, framework::Tensor *col);
 };

-template<ColFormat Format, typename DeviceType, typename T>
+template <ColFormat Format, typename DeviceType, typename T>
 class Col2ImFunctor {
-public:
-  void operator()(const framework::Tensor &col,
-                  const std::vector<int> &dilation,
-                  const std::vector<int> &stride,
-                  const std::vector<int> &padding,
-                  framework::Tensor *im);
+  public:
+    void operator()(const framework::Tensor &col,
+                    const std::vector<int> &dilation,
+                    const std::vector<int> &stride,
+                    const std::vector<int> &padding, framework::Tensor *im);
 };

 } // namespace math
-}     // namespace operators
+} // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/math_function.cc
+++ b/src/operators/math/math_function.cc
@@ -18,122 +18,107 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {

-template<>
-void gemm<float>(const CBLAS_TRANSPOSE transA,
-                 const CBLAS_TRANSPOSE transB, const int M,
-                 const int N, const int K, const float alpha,
-                 const float *A, const float *B, const float beta,
-                 float *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
+template <>
+void gemm<float>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
+                 const int M, const int N, const int K, const float alpha,
+                 const float *A, const float *B, const float beta, float *C) {
+    int lda = (transA == CblasNoTrans) ? K : M;
+    int ldb = (transB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                beta, C, ldc);
 }

-template<>
-void gemm<double>(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE transB, const int M,
-                  const int N, const int K, const double alpha,
-                  const double *A, const double *B,
-                  const double beta, double *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
+template <>
+void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
+                  const int M, const int N, const int K, const double alpha,
+                  const double *A, const double *B, const double beta,
+                  double *C) {
+    int lda = (transA == CblasNoTrans) ? K : M;
+    int ldb = (transB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                beta, C, ldc);
 }

-template<>
-void gemm<float>(const bool transA, const bool transB, const int M,
-                 const int N, const int K, const float alpha,
-                 const float *A, const int lda, const float *B,
-                 const int ldb, const float beta, float *C,
+template <>
+void gemm<float>(const bool transA, const bool transB, const int M, const int N,
+                 const int K, const float alpha, const float *A, const int lda,
+                 const float *B, const int ldb, const float beta, float *C,
                 const int ldc) {
-  cblas_sgemm(CblasRowMajor,
-              transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N,
-              K, alpha, A, lda, B, ldb, beta, C, ldc);
+    cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+                transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+                lda, B, ldb, beta, C, ldc);
 }

-template<>
+template <>
 void gemm<double>(const bool transA, const bool transB, const int M,
-                  const int N, const int K, const double alpha,
-                  const double *A, const int lda, const double *B,
-                  const int ldb, const double beta, double *C,
-                  const int ldc) {
-  cblas_dgemm(CblasRowMajor,
-              transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N,
-              K, alpha, A, lda, B, ldb, beta, C, ldc);
+                  const int N, const int K, const double alpha, const double *A,
+                  const int lda, const double *B, const int ldb,
+                  const double beta, double *C, const int ldc) {
+    cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+                transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+                lda, B, ldb, beta, C, ldc);
 }

-template<>
+template <>
 void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
-                   const framework::Tensor &matrix_b, bool trans_b,
-                   float alpha, framework::Tensor *matrix_out,
-                   float beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
-  //  dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //                     platform::is_cpu_place(matrix_b.place())
-  //                     &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA =
-      (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB =
-      (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<float>(transA, transB, M, N, K, alpha,
-              matrix_a.data<float>(), matrix_b.data<float>(),
-              beta, matrix_out->data<float>());
+                   const framework::Tensor &matrix_b, bool trans_b, float alpha,
+                   framework::Tensor *matrix_out, float beta) {
+    auto dim_a = matrix_a.dims();
+    auto dim_b = matrix_b.dims();
+    auto dim_out = matrix_out->dims();
+    //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
+    //  dim_out.size() ==
+    //  2,
+    //                 "The input and output of matmul be matrix");
+    //
+    //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
+    //                     platform::is_cpu_place(matrix_b.place())
+    //                     &&
+    //                     platform::is_cpu_place(matrix_out->place()),
+    //                 "Matrix must all be in CPUPlace");
+
+    int M = dim_out[0];
+    int N = dim_out[1];
+    int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+    CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+    CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+    gemm<float>(transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+                matrix_b.data<float>(), beta, matrix_out->data<float>());
 }

-template<>
+template <>
 void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b,
-                    double alpha, framework::Tensor *matrix_out,
-                    double beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
-  //  dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //                     platform::is_cpu_place(matrix_b.place())
-  //                     &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA =
-      (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB =
-      (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<double>(transA, transB, M, N, K, alpha,
-               matrix_a.data<double>(), matrix_b.data<double>(),
-               beta, matrix_out->data<double>());
+                    double alpha, framework::Tensor *matrix_out, double beta) {
+    auto dim_a = matrix_a.dims();
+    auto dim_b = matrix_b.dims();
+    auto dim_out = matrix_out->dims();
+    //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
+    //  dim_out.size() ==
+    //  2,
+    //                 "The input and output of matmul be matrix");
+    //
+    //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
+    //                     platform::is_cpu_place(matrix_b.place())
+    //                     &&
+    //                     platform::is_cpu_place(matrix_out->place()),
+    //                 "Matrix must all be in CPUPlace");
+
+    int M = dim_out[0];
+    int N = dim_out[1];
+    int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+    CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+    CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+    gemm<double>(transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+                 matrix_b.data<double>(), beta, matrix_out->data<double>());
 }

 } // namespace math
-}     // namespace operators
+} // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -22,23 +22,21 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {

-template<typename T>
-void gemm(const CBLAS_TRANSPOSE transA,
-          const CBLAS_TRANSPOSE transB, const int M, const int N,
-          const int K, const T alpha, const T *A, const T *B,
-          const T beta, T *C);
-
-template<typename T>
-void gemm(const bool transA, const bool transB, const int M,
-          const int N, const int K, const T alpha, const T *A,
-          const int lda, const T *B, const int ldb, const T beta,
-          T *C, const int ldc);
+template <typename T>
+void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
+          const int M, const int N, const int K, const T alpha, const T *A,
+          const T *B, const T beta, T *C);
+
+template <typename T>
+void gemm(const bool transA, const bool transB, const int M, const int N,
+          const int K, const T alpha, const T *A, const int lda, const T *B,
+          const int ldb, const T beta, T *C, const int ldc);

 // matrix multiply with continuous memory
-template<typename T>
+template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
-            const framework::Tensor &matrix_b, bool trans_b,
-            T alpha, framework::Tensor *matrix_out, T beta);
+            const framework::Tensor &matrix_b, bool trans_b, T alpha,
+            framework::Tensor *matrix_out, T beta);
 } // namespace math
-}     // namespace operators
+} // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/transform.h
+++ b/src/operators/math/transform.h
@@ -37,21 +37,19 @@ namespace math {
 //       class, paddle::fluid::operators::RowwiseTRansformIterator.

 struct Transform {
-  template<typename InputIter, typename OutputIter,
-      typename UnaryOperation>
-  void operator()(InputIter first, InputIter last,
-                  OutputIter result, UnaryOperation op) {
-    std::transform(first, last, result, op);
-  }
-
-  template<typename InputIter1, typename InputIter2,
-      typename OutputIter, typename BinaryOperation>
-  void operator()(InputIter1 first1, InputIter1 last1,
-                  InputIter2 first2, OutputIter result,
-                  BinaryOperation op) {
-    std::transform(first1, last1, first2, result, op);
-  }
+    template <typename InputIter, typename OutputIter, typename UnaryOperation>
+    void operator()(InputIter first, InputIter last, OutputIter result,
+                    UnaryOperation op) {
+        std::transform(first, last, result, op);
+    }
+
+    template <typename InputIter1, typename InputIter2, typename OutputIter,
+              typename BinaryOperation>
+    void operator()(InputIter1 first1, InputIter1 last1, InputIter2 first2,
+                    OutputIter result, BinaryOperation op) {
+        std::transform(first1, last1, first2, result, op);
+    }
 };
-}
-} // namespace platform
-} // namespace paddle
+} // namespace math
+} // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/math/vol2col.cc
+++ b/src/operators/math/vol2col.cc
@@ -25,97 +25,91 @@ using Tensor = paddle_mobile::framework::Tensor;
 *   [input_channels, filter_depth, filter_height, filter_width,
 *                    output_depth, output_height, output_width]
 */
-template<typename T> class Vol2ColFunctor<CPU, T> {
-public:
-  void operator()(const Tensor &vol,
-                  const std::vector<int> &dilations,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings,
-                  Tensor *col) const {
-    //    PADDLE_ENFORCE(vol.dims().size() == 4);
-    //    PADDLE_ENFORCE(col->dims().size() == 7);
-
-    int input_channels = vol.dims()[0];
-    int input_depth = vol.dims()[1];
-    int input_height = vol.dims()[2];
-    int input_width = vol.dims()[3];
-    int filter_depth = col->dims()[1];
-    int filter_height = col->dims()[2];
-    int filter_width = col->dims()[3];
-    int output_depth = col->dims()[4];
-    int output_height = col->dims()[5];
-    int output_width = col->dims()[6];
-    int channels_col = input_channels * filter_depth *
-        filter_height * filter_width;
-
-    //    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-    //                       ((dilations[0] * (filter_depth - 1)
-    //                       + 1))) /
-    //                              strides[0] +
-    //                          1,
-    //                      output_depth,
-    //                      "input_depth and output_depth are "
-    //                      "mismatching.");
-    //    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-    //                       ((dilations[1] * (filter_height -
-    //                       1) + 1))) /
-    //                              strides[1] +
-    //                          1,
-    //                      output_height,
-    //                      "input_height and output_height are
-    //                      "
-    //                      "mismatching.");
-    //    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-    //                       ((dilations[2] * (filter_width - 1)
-    //                       + 1))) /
-    //                              strides[2] +
-    //                          1,
-    //                      output_width,
-    //                      "input_width and output_width are "
-    //                      "mismatching.");
-
-    const T *vol_data = vol.data<T>();
-    T *col_data = col->data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset =
-          (c / filter_width / filter_height) % filter_depth;
-      int c_in =
-          c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] +
-            d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] +
-              h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] +
-                w_offset * dilations[2];
-
-            int col_idx = ((c * output_depth + d) *
-                output_height +
-                h) *
-                output_width +
-                w;
-            int vol_idx =
-                ((c_in * input_depth + d_pad) *
-                    input_height +
-                    h_pad) *
-                    input_width +
-                    w_pad;
-            col_data[col_idx] =
-                (h_pad < 0 || h_pad >= input_height ||
-                    w_pad < 0 || w_pad >= input_width ||
-                    d_pad < 0 || d_pad >= input_depth)
-                ? static_cast<T>(0)
-                : vol_data[vol_idx];
-          }
+template <typename T> class Vol2ColFunctor<CPU, T> {
+  public:
+    void operator()(const Tensor &vol, const std::vector<int> &dilations,
+                    const std::vector<int> &strides,
+                    const std::vector<int> &paddings, Tensor *col) const {
+        //    PADDLE_ENFORCE(vol.dims().size() == 4);
+        //    PADDLE_ENFORCE(col->dims().size() == 7);
+
+        int input_channels = vol.dims()[0];
+        int input_depth = vol.dims()[1];
+        int input_height = vol.dims()[2];
+        int input_width = vol.dims()[3];
+        int filter_depth = col->dims()[1];
+        int filter_height = col->dims()[2];
+        int filter_width = col->dims()[3];
+        int output_depth = col->dims()[4];
+        int output_height = col->dims()[5];
+        int output_width = col->dims()[6];
+        int channels_col =
+            input_channels * filter_depth * filter_height * filter_width;
+
+        //    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+        //                       ((dilations[0] * (filter_depth - 1)
+        //                       + 1))) /
+        //                              strides[0] +
+        //                          1,
+        //                      output_depth,
+        //                      "input_depth and output_depth are "
+        //                      "mismatching.");
+        //    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+        //                       ((dilations[1] * (filter_height -
+        //                       1) + 1))) /
+        //                              strides[1] +
+        //                          1,
+        //                      output_height,
+        //                      "input_height and output_height are
+        //                      "
+        //                      "mismatching.");
+        //    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+        //                       ((dilations[2] * (filter_width - 1)
+        //                       + 1))) /
+        //                              strides[2] +
+        //                          1,
+        //                      output_width,
+        //                      "input_width and output_width are "
+        //                      "mismatching.");
+
+        const T *vol_data = vol.data<T>();
+        T *col_data = col->data<T>();
+
+        for (int c = 0; c < channels_col; ++c) {
+            int w_offset = c % filter_width;
+            int h_offset = (c / filter_width) % filter_height;
+            int d_offset = (c / filter_width / filter_height) % filter_depth;
+            int c_in = c / filter_width / filter_height / filter_depth;
+            for (int d = 0; d < output_depth; ++d) {
+                int d_pad =
+                    d * strides[0] - paddings[0] + d_offset * dilations[0];
+                for (int h = 0; h < output_height; ++h) {
+                    int h_pad =
+                        h * strides[1] - paddings[1] + h_offset * dilations[1];
+                    for (int w = 0; w < output_width; ++w) {
+                        int w_pad = w * strides[2] - paddings[2] +
+                                    w_offset * dilations[2];
+
+                        int col_idx =
+                            ((c * output_depth + d) * output_height + h) *
+                                output_width +
+                            w;
+                        int vol_idx =
+                            ((c_in * input_depth + d_pad) * input_height +
+                             h_pad) *
+                                input_width +
+                            w_pad;
+                        col_data[col_idx] =
+                            (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
+                             w_pad >= input_width || d_pad < 0 ||
+                             d_pad >= input_depth)
+                                ? static_cast<T>(0)
+                                : vol_data[vol_idx];
+                    }
+                }
+            }
        }
-      }
    }
-  }
 };

 /*
@@ -124,96 +118,90 @@ public:
 *   [input_channels, filter_depth, filter_height, filter_width,
 *                    output_depth, output_height, output_width]
 */
-template<typename T> class Col2VolFunctor<CPU, T> {
-public:
-  void operator()(const Tensor &col,
-                  const std::vector<int> &dilations,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings,
-                  Tensor *vol) const {
-    //    PADDLE_ENFORCE(vol->dims().size() == 4);
-    //    PADDLE_ENFORCE(col.dims().size() == 7);
-
-    int input_channels = vol->dims()[0];
-    int input_depth = vol->dims()[1];
-    int input_height = vol->dims()[2];
-    int input_width = vol->dims()[3];
-    int filter_depth = col.dims()[1];
-    int filter_height = col.dims()[2];
-    int filter_width = col.dims()[3];
-    int output_depth = col.dims()[4];
-    int output_height = col.dims()[5];
-    int output_width = col.dims()[6];
-    int channels_col = input_channels * filter_depth *
-        filter_height * filter_width;
-
-    //    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-    //                       ((dilations[0] * (filter_depth - 1)
-    //                       + 1))) /
-    //                              strides[0] +
-    //                          1,
-    //                      output_depth,
-    //                      "input_depth and output_depth are "
-    //                      "mismatching.");
-    //    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-    //                       ((dilations[1] * (filter_height -
-    //                       1) + 1))) /
-    //                              strides[1] +
-    //                          1,
-    //                      output_height,
-    //                      "input_height and output_height are
-    //                      "
-    //                      "mismatching.");
-    //    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-    //                       ((dilations[2] * (filter_width - 1)
-    //                       + 1))) /
-    //                              strides[2] +
-    //                          1,
-    //                      output_width,
-    //                      "input_width and output_width are "
-    //                      "mismatching.");
-    T *vol_data = vol->data<T>();
-    const T *col_data = col.data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset =
-          (c / filter_width / filter_height) % filter_depth;
-      int cIm =
-          c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] +
-            d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] +
-              h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] +
-                w_offset * dilations[2];
-
-            if (h_pad >= 0 && h_pad < input_height &&
-                w_pad >= 0 && w_pad < input_width &&
-                d_pad >= 0 && d_pad < input_depth) {
-              int vol_idx =
-                  ((cIm * input_depth + d_pad) *
-                      input_height +
-                      h_pad) *
-                      input_width +
-                      w_pad;
-
-              int col_idx = ((c * output_depth + d) *
-                  output_height +
-                  h) *
-                  output_width +
-                  w;
-              vol_data[vol_idx] += col_data[col_idx];
+template <typename T> class Col2VolFunctor<CPU, T> {
+  public:
+    void operator()(const Tensor &col, const std::vector<int> &dilations,
+                    const std::vector<int> &strides,
+                    const std::vector<int> &paddings, Tensor *vol) const {
+        //    PADDLE_ENFORCE(vol->dims().size() == 4);
+        //    PADDLE_ENFORCE(col.dims().size() == 7);
+
+        int input_channels = vol->dims()[0];
+        int input_depth = vol->dims()[1];
+        int input_height = vol->dims()[2];
+        int input_width = vol->dims()[3];
+        int filter_depth = col.dims()[1];
+        int filter_height = col.dims()[2];
+        int filter_width = col.dims()[3];
+        int output_depth = col.dims()[4];
+        int output_height = col.dims()[5];
+        int output_width = col.dims()[6];
+        int channels_col =
+            input_channels * filter_depth * filter_height * filter_width;
+
+        //    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+        //                       ((dilations[0] * (filter_depth - 1)
+        //                       + 1))) /
+        //                              strides[0] +
+        //                          1,
+        //                      output_depth,
+        //                      "input_depth and output_depth are "
+        //                      "mismatching.");
+        //    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+        //                       ((dilations[1] * (filter_height -
+        //                       1) + 1))) /
+        //                              strides[1] +
+        //                          1,
+        //                      output_height,
+        //                      "input_height and output_height are
+        //                      "
+        //                      "mismatching.");
+        //    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+        //                       ((dilations[2] * (filter_width - 1)
+        //                       + 1))) /
+        //                              strides[2] +
+        //                          1,
+        //                      output_width,
+        //                      "input_width and output_width are "
+        //                      "mismatching.");
+        T *vol_data = vol->data<T>();
+        const T *col_data = col.data<T>();
+
+        for (int c = 0; c < channels_col; ++c) {
+            int w_offset = c % filter_width;
+            int h_offset = (c / filter_width) % filter_height;
+            int d_offset = (c / filter_width / filter_height) % filter_depth;
+            int cIm = c / filter_width / filter_height / filter_depth;
+            for (int d = 0; d < output_depth; ++d) {
+                int d_pad =
+                    d * strides[0] - paddings[0] + d_offset * dilations[0];
+                for (int h = 0; h < output_height; ++h) {
+                    int h_pad =
+                        h * strides[1] - paddings[1] + h_offset * dilations[1];
+                    for (int w = 0; w < output_width; ++w) {
+                        int w_pad = w * strides[2] - paddings[2] +
+                                    w_offset * dilations[2];
+
+                        if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
+                            w_pad < input_width && d_pad >= 0 &&
+                            d_pad < input_depth) {
+                            int vol_idx =
+                                ((cIm * input_depth + d_pad) * input_height +
+                                 h_pad) *
+                                    input_width +
+                                w_pad;
+
+                            int col_idx =
+                                ((c * output_depth + d) * output_height + h) *
+                                    output_width +
+                                w;
+                            vol_data[vol_idx] += col_data[col_idx];
+                        }
+                    }
+                }
            }
-          }
        }
-      }
    }
-  }
 };

 template class Vol2ColFunctor<CPU, float>;
@@ -222,5 +210,5 @@ template class Col2VolFunctor<CPU, float>;
 template class Col2VolFunctor<CPU, double>;

 } // namespace math
-}     // namespace operators
+} // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/math/vol2col.h
+++ b/src/operators/math/vol2col.h
@@ -72,24 +72,20 @@ namespace math {
 */
 using Tensor = paddle_mobile::framework::Tensor;

-template<typename DeviceType, typename T> class Vol2ColFunctor {
-public:
-  void operator()(const Tensor &vol,
-                  const std::vector<int> &dilations,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings,
-                  Tensor *col) const;
+template <typename DeviceType, typename T> class Vol2ColFunctor {
+  public:
+    void operator()(const Tensor &vol, const std::vector<int> &dilations,
+                    const std::vector<int> &strides,
+                    const std::vector<int> &paddings, Tensor *col) const;
 };

-template<typename DeviceType, typename T> class Col2VolFunctor {
-public:
-  void operator()(const Tensor &col,
-                  const std::vector<int> &dilations,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings,
-                  Tensor *vol) const;
+template <typename DeviceType, typename T> class Col2VolFunctor {
+  public:
+    void operator()(const Tensor &col, const std::vector<int> &dilations,
+                    const std::vector<int> &strides,
+                    const std::vector<int> &paddings, Tensor *vol) const;
 };

 } // namespace math
-}     // namespace operators
+} // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
@@ -21,37 +21,36 @@ SOFTWARE.
 namespace paddle_mobile {
 namespace operators {

-template<typename Dtype, typename T>
-void MulOp<Dtype, T>::InferShape() const {
-  auto x_dims = param_.InputX()->dims();
-  auto y_dims = param_.InputY()->dims();
-  int x_num_col_dims = param_.XNumColDims();
-  int y_num_col_dims = param_.YNumColDims();
+template <typename Dtype, typename T> void MulOp<Dtype, T>::InferShape() const {
+    auto x_dims = param_.InputX()->dims();
+    auto y_dims = param_.InputY()->dims();
+    int x_num_col_dims = param_.XNumColDims();
+    int y_num_col_dims = param_.YNumColDims();

-  assert(x_dims.size() > x_num_col_dims);
-  assert(y_dims.size() > y_num_col_dims);
+    assert(x_dims.size() > x_num_col_dims);
+    assert(y_dims.size() > y_num_col_dims);

-  /// (1,2,3,4) , x_num_col_dims = 2  -> (2,12)
-  auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
-  auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
+    /// (1,2,3,4) , x_num_col_dims = 2  -> (2,12)
+    auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
+    auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);

-  assert(x_mat_dims[1] == y_mat_dims[0]);
+    assert(x_mat_dims[1] == y_mat_dims[0]);

-  std::vector<int64_t> output_dims;
-  output_dims.reserve(static_cast<size_t>(
-                          x_num_col_dims + y_dims.size() - y_num_col_dims));
+    std::vector<int64_t> output_dims;
+    output_dims.reserve(
+        static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));

-  for (int i = 0; i < x_num_col_dims; ++i) {
-    output_dims.push_back(x_dims[i]);
-  }
+    for (int i = 0; i < x_num_col_dims; ++i) {
+        output_dims.push_back(x_dims[i]);
+    }

-  for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
-    output_dims.push_back(y_dims[i]);
-  }
+    for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
+        output_dims.push_back(y_dims[i]);
+    }

-  framework::DDim ddim = framework::make_ddim(output_dims);
-  param_.Out()->Resize(ddim);
+    framework::DDim ddim = framework::make_ddim(output_dims);
+    param_.Out()->Resize(ddim);
 }
 template class MulOp<CPU, float>;
-}
-}
+} // namespace operators
+} // namespace paddle_mobile
--- a/src/operators/mul_op.h
+++ b/src/operators/mul_op.h
@@ -25,28 +25,27 @@ namespace operators {

 using namespace framework;

-template<typename DeviceType, typename T>
+template <typename DeviceType, typename T>
 class MulOp : public framework::OperatorWithKernel<DeviceType> {
-public:
-  MulOp(const std::string &type, const VariableNameMap &inputs,
-        const VariableNameMap &outputs,
-        const framework::AttributeMap attrs,
-        std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<DeviceType>(
-      type, inputs, outputs, attrs, scope),
-        param_(inputs, outputs, attrs, *scope) {}
-
-  void Run() const {
-    operators::MulKernel<DeviceType, T, MulParam> kernel;
-    kernel.Compute(param_);
-  }
-
-  using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
-  void InferShape() const override;
-
-protected:
-  MulParam param_;
+  public:
+    MulOp(const std::string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const framework::AttributeMap attrs,
+          std::shared_ptr<framework::Scope> scope)
+        : framework::OperatorWithKernel<DeviceType>(type, inputs, outputs,
+                                                    attrs, scope),
+          param_(inputs, outputs, attrs, *scope) {}
+
+    void Run() const {
+        operators::MulKernel<DeviceType, T, MulParam> kernel;
+        kernel.Compute(param_);
+    }
+
+    using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
+    void InferShape() const override;
+
+  protected:
+    MulParam param_;
 };

 } // namespace operators
-} // namespace paddle
+} // namespace paddle_mobile
--- a/src/operators/op_param.cpp
+++ b/src/operators/op_param.cpp
@@ -21,25 +21,25 @@ SOFTWARE.
 namespace paddle_mobile {
 namespace operators {
 Print &operator<<(Print &printer, const ConvParam &conv_param) {
-  printer << "parameter of conv: "
-          << "\n";
-  printer << "  stride: "
-          << " (" << conv_param.Strides()[0]
-          << conv_param.Strides()[1] << ") "
-          << "\n";
-  printer << "  paddings: "
-          << " (" << conv_param.Paddings()[0]
-          << conv_param.Paddings()[1] << ") "
-          << "\n";
-  printer << "  dilations: "
-          << " (" << conv_param.Dilations()[0]
-          << conv_param.Dilations()[1] << ") "
-          << "\n";
-  printer << "  groups: " << conv_param.Groups() << "\n";
-  printer << "  input  dims: " << conv_param.Input()->dims() << "\n";
-  printer << "  filter dims: " << conv_param.Filter()->dims() << "\n";
-  printer << "  output dims: " << conv_param.Output()->dims();
-  return printer;
+    printer << "parameter of conv: "
+            << "\n";
+    printer << "  stride: "
+            << " (" << conv_param.Strides()[0] << conv_param.Strides()[1]
+            << ") "
+            << "\n";
+    printer << "  paddings: "
+            << " (" << conv_param.Paddings()[0] << conv_param.Paddings()[1]
+            << ") "
+            << "\n";
+    printer << "  dilations: "
+            << " (" << conv_param.Dilations()[0] << conv_param.Dilations()[1]
+            << ") "
+            << "\n";
+    printer << "  groups: " << conv_param.Groups() << "\n";
+    printer << "  input  dims: " << conv_param.Input()->dims() << "\n";
+    printer << "  filter dims: " << conv_param.Filter()->dims() << "\n";
+    printer << "  output dims: " << conv_param.Output()->dims();
+    return printer;
 }
 } // namespace operators
 } // namespace paddle_mobile
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -31,205 +31,195 @@ namespace operators {
 using namespace framework;

 class OpParam : PaddleMobileObject {
-public:
-protected:
-  template<typename T>
-  static T *InputFrom(const VariableNameMap &inputs,
-                      const Scope &scope) {
-    return GetVarValue<T>("Input", inputs, scope);
-  }
-
-  template<typename T>
-  static T *InputXFrom(const VariableNameMap &inputs,
-                       const Scope &scope) {
-    return GetVarValue<T>("X", inputs, scope);
-  }
-
-  template<typename T>
-  static T *InputYFrom(const VariableNameMap &inputs,
-                       const Scope &scope) {
-    return GetVarValue<T>("Y", inputs, scope);
-  }
-
-  template<typename T>
-  static std::vector<T *>
-  InputMultiFrom(const VariableNameMap &inputs, const Scope &scope) {
-    return GetMultiVarValue<T>("Input", inputs, scope);
-  }
-
-  template<typename T>
-  static T *OutputFrom(const VariableNameMap &outputs,
-                       const Scope &scope) {
-    return GetVarValue<T>("Output", outputs, scope);
-  }
-
-  template<typename T>
-  static T *OutFrom(const VariableNameMap &outputs,
-                    const Scope &scope) {
-    return GetVarValue<T>("Out", outputs, scope);
-  }
-
-  template<typename T>
-  static T *FilterFrom(const VariableNameMap &inputs,
-                       const Scope &scope) {
-    return GetVarValue<T>("Filter", inputs, scope);
-  }
-
-  template<typename T>
-  static const T GetAttr(std::string key, const AttributeMap &map) {
-    return ((Attribute) map.at(key)).Get<T>();
-  }
-
-  template<typename T>
-  static T *GetVarValue(std::string key,
-                        const VariableNameMap &var_map,
-                        const Scope &scope) {
-    auto var_vec = var_map.at(key);
-    if (var_vec.size()) {
-      //      std::cout << " get var value -- " << var_vec[0] <<
-      //      std::endl;
-      auto var = scope.FindVar(var_vec[0]);
-      return var->GetMutable<T>();
-    } else {
-      return nullptr;
+  public:
+  protected:
+    template <typename T>
+    static T *InputFrom(const VariableNameMap &inputs, const Scope &scope) {
+        return GetVarValue<T>("Input", inputs, scope);
    }
-  }
-
-  template<typename T>
-  static std::vector<T *>
-  GetMultiVarValue(std::string key, const VariableNameMap &var_map,
-                   const Scope &scope) {
-    auto var_vecs = var_map.at(key);
-    assert(var_vecs.size() > 1);
-    std::vector<T *> var_res;
-    for (auto &var_vec : var_vecs) {
-      auto var = scope.FindVar(var_vec);
-      var_res.push_back(var->GetMutable<T>());
+
+    template <typename T>
+    static T *InputXFrom(const VariableNameMap &inputs, const Scope &scope) {
+        return GetVarValue<T>("X", inputs, scope);
+    }
+
+    template <typename T>
+    static T *InputYFrom(const VariableNameMap &inputs, const Scope &scope) {
+        return GetVarValue<T>("Y", inputs, scope);
+    }
+
+    template <typename T>
+    static std::vector<T *> InputMultiFrom(const VariableNameMap &inputs,
+                                           const Scope &scope) {
+        return GetMultiVarValue<T>("Input", inputs, scope);
+    }
+
+    template <typename T>
+    static T *OutputFrom(const VariableNameMap &outputs, const Scope &scope) {
+        return GetVarValue<T>("Output", outputs, scope);
+    }
+
+    template <typename T>
+    static T *OutFrom(const VariableNameMap &outputs, const Scope &scope) {
+        return GetVarValue<T>("Out", outputs, scope);
+    }
+
+    template <typename T>
+    static T *FilterFrom(const VariableNameMap &inputs, const Scope &scope) {
+        return GetVarValue<T>("Filter", inputs, scope);
+    }
+
+    template <typename T>
+    static const T GetAttr(std::string key, const AttributeMap &map) {
+        return ((Attribute)map.at(key)).Get<T>();
+    }
+
+    template <typename T>
+    static T *GetVarValue(std::string key, const VariableNameMap &var_map,
+                          const Scope &scope) {
+        auto var_vec = var_map.at(key);
+        if (var_vec.size()) {
+            //      std::cout << " get var value -- " << var_vec[0] <<
+            //      std::endl;
+            auto var = scope.FindVar(var_vec[0]);
+            return var->GetMutable<T>();
+        } else {
+            return nullptr;
+        }
+    }
+
+    template <typename T>
+    static std::vector<T *> GetMultiVarValue(std::string key,
+                                             const VariableNameMap &var_map,
+                                             const Scope &scope) {
+        auto var_vecs = var_map.at(key);
+        assert(var_vecs.size() > 1);
+        std::vector<T *> var_res;
+        for (auto &var_vec : var_vecs) {
+            auto var = scope.FindVar(var_vec);
+            var_res.push_back(var->GetMutable<T>());
+        }
+        return var_res;
    }
-    return var_res;
-  }
 };

 class ConvParam : OpParam {
-public:
-  ConvParam(const VariableNameMap &inputs,
-            const VariableNameMap &outputs,
-            const framework::AttributeMap &attrs,
-            const framework::Scope &scope) {
-    filter_ = FilterFrom<framework::LoDTensor>(inputs, scope);
-    input_ = InputFrom<framework::Tensor>(inputs, scope);
-    output_ = OutputFrom<framework::Tensor>(outputs, scope);
-    strides_ = GetAttr<std::vector<int>>("strides", attrs);
-    paddings_ = GetAttr<std::vector<int>>("paddings", attrs);
-    dilations_ = GetAttr<std::vector<int>>("dilations", attrs);
-    groups = GetAttr<int>("groups", attrs);
-  }
-
-  const Tensor *Input() const { return input_; }
-
-  const LoDTensor *Filter() const { return filter_; }
-
-  Tensor *Output() const { return output_; }
-
-  const std::vector<int> &Strides() const { return strides_; }
-
-  const std::vector<int> &Paddings() const { return paddings_; }
-
-  const std::vector<int> &Dilations() const { return dilations_; }
-
-  const int &Groups() const { return groups; }
-
-private:
-  Tensor *input_;
-  Tensor *output_;
-  LoDTensor *filter_;
-  std::vector<int> strides_;
-  std::vector<int> paddings_;
-  std::vector<int> dilations_;
-  int groups;
+  public:
+    ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+              const framework::AttributeMap &attrs,
+              const framework::Scope &scope) {
+        filter_ = FilterFrom<framework::LoDTensor>(inputs, scope);
+        input_ = InputFrom<framework::Tensor>(inputs, scope);
+        output_ = OutputFrom<framework::Tensor>(outputs, scope);
+        strides_ = GetAttr<std::vector<int>>("strides", attrs);
+        paddings_ = GetAttr<std::vector<int>>("paddings", attrs);
+        dilations_ = GetAttr<std::vector<int>>("dilations", attrs);
+        groups = GetAttr<int>("groups", attrs);
+    }
+
+    const Tensor *Input() const { return input_; }
+
+    const LoDTensor *Filter() const { return filter_; }
+
+    Tensor *Output() const { return output_; }
+
+    const std::vector<int> &Strides() const { return strides_; }
+
+    const std::vector<int> &Paddings() const { return paddings_; }
+
+    const std::vector<int> &Dilations() const { return dilations_; }
+
+    const int &Groups() const { return groups; }
+
+  private:
+    Tensor *input_;
+    Tensor *output_;
+    LoDTensor *filter_;
+    std::vector<int> strides_;
+    std::vector<int> paddings_;
+    std::vector<int> dilations_;
+    int groups;
 };

 Print &operator<<(Print &printer, const ConvParam &conv_param);

 class ElementwiseAddParam : OpParam {
-public:
-  ElementwiseAddParam(const VariableNameMap &inputs,
-                      const VariableNameMap &outputs,
-                      const framework::AttributeMap &attrs,
-                      const framework::Scope &scope) {
-    input_x_ = InputXFrom<framework::Tensor>(inputs, scope);
-    input_y_ = InputYFrom<framework::Tensor>(inputs, scope);
-    out_ = OutFrom<framework::Tensor>(outputs, scope);
-    axis_ = GetAttr<int>("axis", attrs);
-  }
-
-  const Tensor *InputX() const { return input_x_; }
-
-  const Tensor *InputY() const { return input_y_; }
-
-  Tensor *Out() const { return out_; }
-
-  const int &Axis() const { return axis_; }
-
-private:
-  Tensor *input_x_;
-  Tensor *input_y_;
-  Tensor *out_;
-  int axis_;
+  public:
+    ElementwiseAddParam(const VariableNameMap &inputs,
+                        const VariableNameMap &outputs,
+                        const framework::AttributeMap &attrs,
+                        const framework::Scope &scope) {
+        input_x_ = InputXFrom<framework::Tensor>(inputs, scope);
+        input_y_ = InputYFrom<framework::Tensor>(inputs, scope);
+        out_ = OutFrom<framework::Tensor>(outputs, scope);
+        axis_ = GetAttr<int>("axis", attrs);
+    }
+
+    const Tensor *InputX() const { return input_x_; }
+
+    const Tensor *InputY() const { return input_y_; }
+
+    Tensor *Out() const { return out_; }
+
+    const int &Axis() const { return axis_; }
+
+  private:
+    Tensor *input_x_;
+    Tensor *input_y_;
+    Tensor *out_;
+    int axis_;
 };

 class MulParam : OpParam {
-public:
-  MulParam(const VariableNameMap &inputs,
-           const VariableNameMap &outputs,
-           const framework::AttributeMap &attrs,
-           const framework::Scope &scope) {
-    input_x_ = InputXFrom<framework::Tensor>(inputs, scope);
-    input_y_ = InputYFrom<framework::Tensor>(inputs, scope);
-    out_ = OutFrom<framework::Tensor>(outputs, scope);
-    x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
-    y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
-  }
-
-  const Tensor *InputX() const { return input_x_; }
-
-  const Tensor *InputY() const { return input_y_; }
-
-  Tensor *Out() const { return out_; }
-
-  const int &XNumColDims() const { return x_num_col_dims_; }
-
-  const int &YNumColDims() const { return y_num_col_dims_; }
-
-private:
-  Tensor *input_x_;
-  Tensor *input_y_;
-  Tensor *out_;
-  int x_num_col_dims_;
-  int y_num_col_dims_;
+  public:
+    MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+             const framework::AttributeMap &attrs,
+             const framework::Scope &scope) {
+        input_x_ = InputXFrom<framework::Tensor>(inputs, scope);
+        input_y_ = InputYFrom<framework::Tensor>(inputs, scope);
+        out_ = OutFrom<framework::Tensor>(outputs, scope);
+        x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
+        y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
+    }
+
+    const Tensor *InputX() const { return input_x_; }
+
+    const Tensor *InputY() const { return input_y_; }
+
+    Tensor *Out() const { return out_; }
+
+    const int &XNumColDims() const { return x_num_col_dims_; }
+
+    const int &YNumColDims() const { return y_num_col_dims_; }
+
+  private:
+    Tensor *input_x_;
+    Tensor *input_y_;
+    Tensor *out_;
+    int x_num_col_dims_;
+    int y_num_col_dims_;
 };

 class ConcatParam : public OpParam {
-public:
-  ConcatParam(const VariableNameMap &inputs,
-              const VariableNameMap &outputs,
-              const framework::AttributeMap &attrs,
-              const framework::Scope &scope) {
-    inputs_ = InputMultiFrom<framework::Tensor>(inputs, scope);
-    out_ = OutFrom<framework::Tensor>(outputs, scope);
-    axis_ = GetAttr<int>("axis", attrs);
-  }
+  public:
+    ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+                const framework::AttributeMap &attrs,
+                const framework::Scope &scope) {
+        inputs_ = InputMultiFrom<framework::Tensor>(inputs, scope);
+        out_ = OutFrom<framework::Tensor>(outputs, scope);
+        axis_ = GetAttr<int>("axis", attrs);
+    }

-  std::vector<Tensor *> Inputs() const { return inputs_; }
+    std::vector<Tensor *> Inputs() const { return inputs_; }

-  Tensor *Out() const { return out_; }
+    Tensor *Out() const { return out_; }

-  const int &Axis() const { return axis_; }
+    const int &Axis() const { return axis_; }

-private:
-  std::vector<Tensor *> inputs_;
-  Tensor *out_;
-  int axis_;
+  private:
+    std::vector<Tensor *> inputs_;
+    Tensor *out_;
+    int axis_;
 };

 } // namespace operators

--- a/test/common/test_log.cpp
+++ b/test/common/test_log.cpp
@@ -22,9 +22,10 @@ int main() {

    DLOGF("DASJFDAFJ%d -- %f", 12345, 344.234);

-    LOGF( paddle_mobile::kLOG_DEBUG, "DASJFDAFJ%d -- %f", 12345, 344.234);
+    LOGF(paddle_mobile::kLOG_DEBUG, "DASJFDAFJ%d -- %f", 12345, 344.234);

-    LOG(paddle_mobile::kLOG_DEBUG) << "test debug" << " next log";
+    LOG(paddle_mobile::kLOG_DEBUG) << "test debug"
+                                   << " next log";

    LOG(paddle_mobile::kLOG_DEBUG1) << "test debug1"
                                    << " next log";

--- a/test/elementwise_add_op_test.h
+++ b/test/elementwise_add_op_test.h
@@ -21,149 +21,144 @@ SOFTWARE.
 #include "test_include.h"

 namespace paddle_mobile {
-    namespace framework {
-
-        template <typename Dtype> class TestElementwiseAddOp {
-          public:
-            TestElementwiseAddOp(const Program<Dtype> p) : program_(p) {
-                if (use_optimize_) {
-                    to_predict_program_ = program_.optimizeProgram;
-                } else {
-                    to_predict_program_ = program_.originProgram;
-                }
-
-                const std::vector<std::shared_ptr<BlockDesc>> blocks =
-                    to_predict_program_->Blocks();
-                //  DLOG << " **block size " << blocks.size();
-                for (int i = 0; i < blocks.size(); ++i) {
-                    std::shared_ptr<BlockDesc> block_desc = blocks[i];
-                    std::vector<std::shared_ptr<OpDesc>> ops =
-                        block_desc->Ops();
-                    //    DLOG << " ops " << ops.size();
-                    for (int j = 0; j < ops.size(); ++j) {
-                        std::shared_ptr<OpDesc> op = ops[j];
-                        //                        if (op->Type() ==
-                        //                        "elementwise_add") {
-                        //                            if
-                        //                            (op->GetAttrMap().at("axis").Get<int>()
-                        //                            != -1) {
-                        //                                DLOG << "attr: axis =
-                        //                                "
-                        //                                     <<
-                        //                                     op->GetAttrMap().at("axis").Get<int>();
-                        //                            }
-                        //                        }
-                        //                        DLOG << "op:" << op->Type();
-                        if (op->Type() == "elementwise_add" &&
-                            op->Input("X")[0] == "batch_norm_2.tmp_2") {
-                            DLOG << " elementwise_add attr size: "
-                                 << op->GetAttrMap().size();
-                            DLOG << " inputs size: " << op->GetInputs().size();
-                            DLOG << " outputs size: "
-                                 << op->GetOutputs().size();
-                            DLOG << " Input X is : " << op->Input("X")[0];
-                            DLOG << " Input Y is : " << op->Input("Y")[0];
-                            DLOG << " Output Out is : " << op->Output("Out")[0];
-                            Attribute axis_attr = op->GetAttrMap().at("axis");
-                            int axis = axis_attr.Get<int>();
-                            DLOG << " Attr axis is : " << axis;
-
-                            std::shared_ptr<
-                                operators::ElementwiseAddOp<Dtype, float>>
-                                add = std::make_shared<
-                                    operators::ElementwiseAddOp<Dtype, float>>(
-                                    op->Type(), op->GetInputs(),
-                                    op->GetOutputs(), op->GetAttrMap(),
-                                    program_.scope);
-                            ops_of_block_[*block_desc.get()].push_back(add);
-                        }
-                    }
-                }
-            }
-
-            std::shared_ptr<Tensor> predict_add(Tensor &t1, Tensor &t2) {
-                // feed
-                auto scope = program_.scope;
-                Variable *x_feed_value = scope->Var("batch_norm_2.tmp_2");
-                auto tensor_x = x_feed_value->GetMutable<Tensor>();
-                tensor_x->ShareDataWith(t1);
-
-                Variable *y_feed_value = scope->Var("batch_norm_0.tmp_3");
-                auto tensor_y = y_feed_value->GetMutable<Tensor>();
-                tensor_y->ShareDataWith(t2);
-
-                Variable *con_output = scope->Var("elementwise_add_0.tmp_0");
-                Tensor *output_tensor = con_output->GetMutable<Tensor>();
-                output_tensor->mutable_data<float>({1, 3, 224, 224});
-                //  DLOG << typeid(output_tensor).name();
-                //  DLOG << "output_tensor dims: " << output_tensor->dims();
-
-                std::shared_ptr<Tensor> out_tensor =
-                    std::make_shared<LoDTensor>();
-                out_tensor.reset(output_tensor);
-
-                predict_add(t1, t2, 0);
-                return out_tensor;
-            }
+namespace framework {
+
+template <typename Dtype> class TestElementwiseAddOp {
+  public:
+    TestElementwiseAddOp(const Program<Dtype> p) : program_(p) {
+        if (use_optimize_) {
+            to_predict_program_ = program_.optimizeProgram;
+        } else {
+            to_predict_program_ = program_.originProgram;
+        }

-          private:
-            const framework::Program<Dtype> program_;
-            std::shared_ptr<ProgramDesc> to_predict_program_;
-            std::map<framework::BlockDesc,
-                     std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-                ops_of_block_;
-            bool use_optimize_ = false;
-
-            void predict_add(const Tensor &t1, const Tensor &t2, int block_id) {
-                std::shared_ptr<BlockDesc> to_predict_block =
-                    to_predict_program_->Block(block_id);
-                for (int j = 0;
-                     j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
-                    auto op = ops_of_block_[*to_predict_block.get()][j];
-                    DLOG << "op -> run()";
-                    op->Run();
+        const std::vector<std::shared_ptr<BlockDesc>> blocks =
+            to_predict_program_->Blocks();
+        //  DLOG << " **block size " << blocks.size();
+        for (int i = 0; i < blocks.size(); ++i) {
+            std::shared_ptr<BlockDesc> block_desc = blocks[i];
+            std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+            //    DLOG << " ops " << ops.size();
+            for (int j = 0; j < ops.size(); ++j) {
+                std::shared_ptr<OpDesc> op = ops[j];
+                //                        if (op->Type() ==
+                //                        "elementwise_add") {
+                //                            if
+                //                            (op->GetAttrMap().at("axis").Get<int>()
+                //                            != -1) {
+                //                                DLOG << "attr: axis =
+                //                                "
+                //                                     <<
+                //                                     op->GetAttrMap().at("axis").Get<int>();
+                //                            }
+                //                        }
+                //                        DLOG << "op:" << op->Type();
+                if (op->Type() == "elementwise_add" &&
+                    op->Input("X")[0] == "batch_norm_2.tmp_2") {
+                    DLOG << " elementwise_add attr size: "
+                         << op->GetAttrMap().size();
+                    DLOG << " inputs size: " << op->GetInputs().size();
+                    DLOG << " outputs size: " << op->GetOutputs().size();
+                    DLOG << " Input X is : " << op->Input("X")[0];
+                    DLOG << " Input Y is : " << op->Input("Y")[0];
+                    DLOG << " Output Out is : " << op->Output("Out")[0];
+                    Attribute axis_attr = op->GetAttrMap().at("axis");
+                    int axis = axis_attr.Get<int>();
+                    DLOG << " Attr axis is : " << axis;
+
+                    std::shared_ptr<operators::ElementwiseAddOp<Dtype, float>>
+                        add = std::make_shared<
+                            operators::ElementwiseAddOp<Dtype, float>>(
+                            op->Type(), op->GetInputs(), op->GetOutputs(),
+                            op->GetAttrMap(), program_.scope);
+                    ops_of_block_[*block_desc.get()].push_back(add);
                }
            }
-        };
-
-        template class TestElementwiseAddOp<CPU>;
-    } // namespace framework
-
-    namespace test {
-        void testElementwiseAdd() {
-            DLOG << "----------**********----------";
-            DLOG << "begin to run ElementAddOp Test";
-            paddle_mobile::Loader<paddle_mobile::CPU> loader;
-            auto program = loader.Load(
-                std::string("../../test/models/"
-                            "image_classification_resnet.inference.model"));
-
-            /// input x (1,3,224,224)
-            paddle_mobile::framework::Tensor inputx;
-            SetupTensor<float>(&inputx, {1, 3, 224, 224}, static_cast<float>(0),
-                               static_cast<float>(1));
-            float *inputx_ptr = inputx.data<float>();
-            /// input y (224,)
-            paddle_mobile::framework::Tensor inputy;
-            SetupTensor<float>(&inputy, {224}, static_cast<float>(0),
-                               static_cast<float>(1));
-            float *inputy_ptr = inputy.data<float>();
-
-            paddle_mobile::framework::TestElementwiseAddOp<paddle_mobile::CPU>
-                testElementwiseAddOp(program);
-
-            auto output_add = testElementwiseAddOp.predict_add(inputx, inputy);
-            float *output_add_ptr = output_add->data<float>();
-            //            for (int j = 0; j < output_add->numel(); ++j) {
-            //                DLOG << "value of output: " << output_add_ptr[j];
-            //            }
-
-            /// output (1,3,224,224)
-            DLOG << "output memory size : " << output_add->memory_size();
-            DLOG << "output numel : " << output_add->numel();
-
-            DLOG << inputx_ptr[226] << " + " << inputy_ptr[2] << " = "
-                 << output_add_ptr[226];
        }
-    } // namespace test
+    }
+
+    std::shared_ptr<Tensor> predict_add(Tensor &t1, Tensor &t2) {
+        // feed
+        auto scope = program_.scope;
+        Variable *x_feed_value = scope->Var("batch_norm_2.tmp_2");
+        auto tensor_x = x_feed_value->GetMutable<Tensor>();
+        tensor_x->ShareDataWith(t1);
+
+        Variable *y_feed_value = scope->Var("batch_norm_0.tmp_3");
+        auto tensor_y = y_feed_value->GetMutable<Tensor>();
+        tensor_y->ShareDataWith(t2);
+
+        Variable *con_output = scope->Var("elementwise_add_0.tmp_0");
+        Tensor *output_tensor = con_output->GetMutable<Tensor>();
+        output_tensor->mutable_data<float>({1, 3, 224, 224});
+        //  DLOG << typeid(output_tensor).name();
+        //  DLOG << "output_tensor dims: " << output_tensor->dims();
+
+        std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+        out_tensor.reset(output_tensor);
+
+        predict_add(t1, t2, 0);
+        return out_tensor;
+    }
+
+  private:
+    const framework::Program<Dtype> program_;
+    std::shared_ptr<ProgramDesc> to_predict_program_;
+    std::map<framework::BlockDesc,
+             std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+        ops_of_block_;
+    bool use_optimize_ = false;
+
+    void predict_add(const Tensor &t1, const Tensor &t2, int block_id) {
+        std::shared_ptr<BlockDesc> to_predict_block =
+            to_predict_program_->Block(block_id);
+        for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size();
+             ++j) {
+            auto op = ops_of_block_[*to_predict_block.get()][j];
+            DLOG << "op -> run()";
+            op->Run();
+        }
+    }
+};
+
+template class TestElementwiseAddOp<CPU>;
+} // namespace framework
+
+namespace test {
+void testElementwiseAdd() {
+    DLOG << "----------**********----------";
+    DLOG << "begin to run ElementAddOp Test";
+    paddle_mobile::Loader<paddle_mobile::CPU> loader;
+    auto program =
+        loader.Load(std::string("../../test/models/"
+                                "image_classification_resnet.inference.model"));
+
+    /// input x (1,3,224,224)
+    paddle_mobile::framework::Tensor inputx;
+    SetupTensor<float>(&inputx, {1, 3, 224, 224}, static_cast<float>(0),
+                       static_cast<float>(1));
+    float *inputx_ptr = inputx.data<float>();
+    /// input y (224,)
+    paddle_mobile::framework::Tensor inputy;
+    SetupTensor<float>(&inputy, {224}, static_cast<float>(0),
+                       static_cast<float>(1));
+    float *inputy_ptr = inputy.data<float>();
+
+    paddle_mobile::framework::TestElementwiseAddOp<paddle_mobile::CPU>
+        testElementwiseAddOp(program);
+
+    auto output_add = testElementwiseAddOp.predict_add(inputx, inputy);
+    float *output_add_ptr = output_add->data<float>();
+    //            for (int j = 0; j < output_add->numel(); ++j) {
+    //                DLOG << "value of output: " << output_add_ptr[j];
+    //            }
+
+    /// output (1,3,224,224)
+    DLOG << "output memory size : " << output_add->memory_size();
+    DLOG << "output numel : " << output_add->numel();
+
+    DLOG << inputx_ptr[226] << " + " << inputy_ptr[2] << " = "
+         << output_add_ptr[226];
+}
+} // namespace test
 } // namespace paddle_mobile
--- a/test/mul_op_test.h
+++ b/test/mul_op_test.h
@@ -21,182 +21,170 @@ SOFTWARE.
 #include "test_include.h"

 namespace paddle_mobile {
-    namespace framework {
-
-        template <typename Dtype> class TestMulOp {
-          public:
-            TestMulOp(const Program<Dtype> p) : program_(p) {
-                if (use_optimize_) {
-                    to_predict_program_ = program_.optimizeProgram;
-                } else {
-                    to_predict_program_ = program_.originProgram;
-                }
-
-                const std::vector<std::shared_ptr<BlockDesc>> blocks =
-                    to_predict_program_->Blocks();
-                //  DLOG << " **block size " << blocks.size();
-                for (int i = 0; i < blocks.size(); ++i) {
-                    std::shared_ptr<BlockDesc> block_desc = blocks[i];
-                    std::vector<std::shared_ptr<OpDesc>> ops =
-                        block_desc->Ops();
-                    //    DLOG << " ops " << ops.size();
-                    for (int j = 0; j < ops.size(); ++j) {
-                        std::shared_ptr<OpDesc> op = ops[j];
-                        //                        if (op->Type() == "mul") {
-                        //                            DLOG << "x_num_col_dims :
-                        //                            "
-                        //                                 << op->GetAttrMap()
-                        //                                        .at("x_num_col_dims")
-                        //                                        .Get<int>();
-                        //                            DLOG << "y_num_col_dims :
-                        //                            "
-                        //                                 << op->GetAttrMap()
-                        //                                        .at("y_num_col_dims")
-                        //                                        .Get<int>();
-                        //                            DLOG << " Input X is : "
-                        //                            << op->Input("X")[0];
-                        //                        }
-                        //                        DLOG << "op:" << op->Type();
-                        if (op->Type() == "mul" &&
-                            op->Input("X")[0] == "pool2d_0.tmp_0") {
-                            DLOG << " mul attr size: "
-                                 << op->GetAttrMap().size();
-                            DLOG << " inputs size: " << op->GetInputs().size();
-                            DLOG << " outputs size: "
-                                 << op->GetOutputs().size();
-                            DLOG << " Input X is : " << op->Input("X")[0];
-                            DLOG << " Input Y is : " << op->Input("Y")[0];
-                            DLOG << " Output Out is : " << op->Output("Out")[0];
-                            DLOG << "x_num_col_dims : "
-                                 << op->GetAttrMap()
-                                        .at("x_num_col_dims")
-                                        .Get<int>();
-                            DLOG << "y_num_col_dims : "
-                                 << op->GetAttrMap()
-                                        .at("y_num_col_dims")
-                                        .Get<int>();
-
-                            std::shared_ptr<operators::MulOp<Dtype, float>>
-                                add = std::make_shared<
-                                    operators::MulOp<Dtype, float>>(
-                                    op->Type(), op->GetInputs(),
-                                    op->GetOutputs(), op->GetAttrMap(),
-                                    program_.scope);
-                            ops_of_block_[*block_desc.get()].push_back(add);
-                        }
-                    }
-                }
-            }
-
-            std::shared_ptr<Tensor> predict_add(Tensor &t1, Tensor &t2) {
-                // feed
-                auto scope = program_.scope;
-                Variable *x_feed_value = scope->Var("pool2d_0.tmp_0");
-                auto tensor_x = x_feed_value->GetMutable<Tensor>();
-                tensor_x->ShareDataWith(t1);
-
-                Variable *y_feed_value = scope->Var("fc_0.w_0");
-                auto tensor_y = y_feed_value->GetMutable<Tensor>();
-                tensor_y->ShareDataWith(t2);
-
-                Variable *con_output = scope->Var("fc_0.tmp_0");
-                Tensor *output_tensor = con_output->GetMutable<Tensor>();
-                output_tensor->mutable_data<float>({3, 3});
-                //  DLOG << typeid(output_tensor).name();
-                //  DLOG << "output_tensor dims: " << output_tensor->dims();
-
-                std::shared_ptr<Tensor> out_tensor =
-                    std::make_shared<LoDTensor>();
-                out_tensor.reset(output_tensor);
-
-                predict_add(t1, t2, 0);
-                return out_tensor;
-            }
-
-          private:
-            const framework::Program<Dtype> program_;
-            std::shared_ptr<ProgramDesc> to_predict_program_;
-            std::map<framework::BlockDesc,
-                     std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-                ops_of_block_;
-            bool use_optimize_ = false;
-
-            void predict_add(const Tensor &t1, const Tensor &t2, int block_id) {
-                std::shared_ptr<BlockDesc> to_predict_block =
-                    to_predict_program_->Block(block_id);
-                for (int j = 0;
-                     j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
-                    auto op = ops_of_block_[*to_predict_block.get()][j];
-                    DLOG << "op -> run()";
-                    op->Run();
-                }
-            }
-        };
-
-        template class TestMulOp<CPU>;
-    } // namespace framework
-
-    namespace test {
-        void testMul() {
-            DLOG << "----------**********----------";
-            DLOG << "begin to run MulOp Test";
-            paddle_mobile::Loader<paddle_mobile::CPU> loader;
-            auto program = loader.Load(
-                std::string("../../test/models/"
-                            "image_classification_resnet.inference.model"));
-
-            /// input x (3,2,1,1)
-            paddle_mobile::framework::Tensor inputx;
-            SetupTensor<float>(&inputx, {3, 2, 1, 1}, static_cast<float>(0),
-                               static_cast<float>(1));
-            float *inputx_ptr = inputx.data<float>();
-
-            /// input y (2,3)
-            paddle_mobile::framework::Tensor inputy;
-            SetupTensor<float>(&inputy, {2, 3}, static_cast<float>(0),
-                               static_cast<float>(1));
-            float *inputy_ptr = inputy.data<float>();
-
-            paddle_mobile::framework::TestMulOp<paddle_mobile::CPU> testMulOp(
-                program);
-
-            auto output_mul = testMulOp.predict_add(inputx, inputy);
-            float *output_mul_ptr = output_mul->data<float>();
-
-            auto dimx_1 = inputx.numel() / inputx.dims()[0];
-            DLOG << " inputx : ";
-            for (int i = 0; i < inputx.dims()[0]; ++i) {
-                for (int j = 0; j < dimx_1; ++j) {
-                    DLOGF("%f ", inputx_ptr[i * dimx_1 + j]);
-                }
-                DLOGF("\n");
-            }
-
-            auto dimy_1 = inputy.numel() / inputy.dims()[0];
-            DLOG << " inputy : ";
-            for (int i = 0; i < inputy.dims()[0]; ++i) {
-                for (int j = 0; j < dimy_1; ++j) {
-                    DLOGF("%f ", inputy_ptr[i * dimx_1 + j]);
-                }
-                DLOGF("\n");
-            }
+namespace framework {
+
+/// Test harness that picks one specific "mul" operator out of a program and
+/// runs it on caller-supplied tensors.
+template <typename Dtype> class TestMulOp {
+  public:
+    /// Walks every block of the selected program description and builds a
+    /// MulOp for each op of type "mul" whose first X input is
+    /// "pool2d_0.tmp_0". The created ops are stored per block in
+    /// ops_of_block_.
+    ///
+    /// @param p program to take the op descriptions and the scope from.
+    TestMulOp(const Program<Dtype> p) : program_(p) {
+        if (use_optimize_) {
+            to_predict_program_ = program_.optimizeProgram;
+        } else {
+            to_predict_program_ = program_.originProgram;
+        }

-            auto dim_output_1 = output_mul->numel() / output_mul->dims()[0];
-            DLOG << " output : ";
-            for (int i = 0; i < output_mul->dims()[0]; ++i) {
-                for (int j = 0; j < dim_output_1; ++j) {
-                    DLOGF("%f ", output_mul_ptr[i * dimy_1 + j]);
+        const std::vector<std::shared_ptr<BlockDesc>> blocks =
+            to_predict_program_->Blocks();
+        //  DLOG << " **block size " << blocks.size();
+        for (int i = 0; i < blocks.size(); ++i) {
+            std::shared_ptr<BlockDesc> block_desc = blocks[i];
+            std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+            //    DLOG << " ops " << ops.size();
+            for (int j = 0; j < ops.size(); ++j) {
+                std::shared_ptr<OpDesc> op = ops[j];
+                //                        if (op->Type() == "mul") {
+                //                            DLOG << "x_num_col_dims :
+                //                            "
+                //                                 << op->GetAttrMap()
+                //                                        .at("x_num_col_dims")
+                //                                        .Get<int>();
+                //                            DLOG << "y_num_col_dims :
+                //                            "
+                //                                 << op->GetAttrMap()
+                //                                        .at("y_num_col_dims")
+                //                                        .Get<int>();
+                //                            DLOG << " Input X is : "
+                //                            << op->Input("X")[0];
+                //                        }
+                //                        DLOG << "op:" << op->Type();
+                // Only the mul op fed by "pool2d_0.tmp_0" is of interest.
+                if (op->Type() == "mul" &&
+                    op->Input("X")[0] == "pool2d_0.tmp_0") {
+                    DLOG << " mul attr size: " << op->GetAttrMap().size();
+                    DLOG << " inputs size: " << op->GetInputs().size();
+                    DLOG << " outputs size: " << op->GetOutputs().size();
+                    DLOG << " Input X is : " << op->Input("X")[0];
+                    DLOG << " Input Y is : " << op->Input("Y")[0];
+                    DLOG << " Output Out is : " << op->Output("Out")[0];
+                    DLOG << "x_num_col_dims : "
+                         << op->GetAttrMap().at("x_num_col_dims").Get<int>();
+                    DLOG << "y_num_col_dims : "
+                         << op->GetAttrMap().at("y_num_col_dims").Get<int>();
+
+                    std::shared_ptr<operators::MulOp<Dtype, float>> mul_op =
+                        std::make_shared<operators::MulOp<Dtype, float>>(
+                            op->Type(), op->GetInputs(), op->GetOutputs(),
+                            op->GetAttrMap(), program_.scope);
+                    ops_of_block_[*block_desc.get()].push_back(mul_op);
                 }
-                DLOGF("\n");
             }
+        }
+    }
+
+    /// Feeds t1 and t2 into the scope variables "pool2d_0.tmp_0" and
+    /// "fc_0.w_0", sizes the output variable "fc_0.tmp_0" to {3, 3}, runs the
+    /// ops collected for block 0 and hands back the output tensor.
+    ///
+    /// @param t1 tensor shared into the X input of the mul op.
+    /// @param t2 tensor shared into the Y input of the mul op.
+    /// @return non-owning handle to the output tensor held by the scope; it
+    ///         must not outlive this object's program scope.
+    std::shared_ptr<Tensor> predict_add(Tensor &t1, Tensor &t2) {
+        // feed
+        auto scope = program_.scope;
+        Variable *x_feed_value = scope->Var("pool2d_0.tmp_0");
+        auto tensor_x = x_feed_value->GetMutable<Tensor>();
+        tensor_x->ShareDataWith(t1);
+
+        Variable *y_feed_value = scope->Var("fc_0.w_0");
+        auto tensor_y = y_feed_value->GetMutable<Tensor>();
+        tensor_y->ShareDataWith(t2);
+
+        Variable *con_output = scope->Var("fc_0.tmp_0");
+        Tensor *output_tensor = con_output->GetMutable<Tensor>();
+        output_tensor->mutable_data<float>({3, 3});
+        //  DLOG << typeid(output_tensor).name();
+        //  DLOG << "output_tensor dims: " << output_tensor->dims();
+
+        // output_tensor comes from the scope variable, so the returned handle
+        // uses a no-op deleter instead of taking ownership (the previous
+        // reset() made the shared_ptr delete it as well).
+        // NOTE(review): assumes Variable keeps ownership of the object
+        // returned by GetMutable() - confirm against framework/variable.h.
+        std::shared_ptr<Tensor> out_tensor(output_tensor, [](Tensor *) {});
+
+        predict_add(t1, t2, 0);
+        return out_tensor;
+    }
+
+  private:
+    const framework::Program<Dtype> program_;
+    std::shared_ptr<ProgramDesc> to_predict_program_;
+    // Ops selected by the constructor, keyed by the block they belong to.
+    std::map<framework::BlockDesc,
+             std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+        ops_of_block_;
+    // Selects optimizeProgram instead of originProgram when true.
+    bool use_optimize_ = false;
+
+    /// Runs, in insertion order, every op collected for block block_id.
+    /// t1 and t2 are not read here; the inputs are already bound in the scope.
+    void predict_add(const Tensor &t1, const Tensor &t2, int block_id) {
+        std::shared_ptr<BlockDesc> to_predict_block =
+            to_predict_program_->Block(block_id);
+        for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size();
+             ++j) {
+            auto op = ops_of_block_[*to_predict_block.get()][j];
+            DLOG << "op -> run()";
+            op->Run();
+        }
+    }
+};
+
+template class TestMulOp<CPU>; // explicit instantiation for CPU
+} // namespace framework
+
+namespace test {
+/// Exercises the mul operator: loads the ResNet inference model, runs
+/// TestMulOp on a (3,2,1,1) and a (2,3) tensor, then logs both inputs, the
+/// output and one hand-expanded dot product for visual comparison. Nothing is
+/// asserted; the result is only written to the debug log.
+void testMul() {
+    DLOG << "----------**********----------";
+    DLOG << "begin to run MulOp Test";
+    paddle_mobile::Loader<paddle_mobile::CPU> loader;
+    auto program =
+        loader.Load(std::string("../../test/models/"
+                                "image_classification_resnet.inference.model"));
+
+    /// input x (3,2,1,1)
+    paddle_mobile::framework::Tensor inputx;
+    SetupTensor<float>(&inputx, {3, 2, 1, 1}, static_cast<float>(0),
+                       static_cast<float>(1));
+    float *inputx_ptr = inputx.data<float>();
+
+    /// input y (2,3)
+    paddle_mobile::framework::Tensor inputy;
+    SetupTensor<float>(&inputy, {2, 3}, static_cast<float>(0),
+                       static_cast<float>(1));
+    float *inputy_ptr = inputy.data<float>();
+
+    paddle_mobile::framework::TestMulOp<paddle_mobile::CPU> testMulOp(program);
+
+    auto output_mul = testMulOp.predict_add(inputx, inputy);
+    float *output_mul_ptr = output_mul->data<float>();
+
+    // Each tensor is printed as a 2-D matrix: dims()[0] rows, with the row
+    // length (stride) being the element count divided by the row count.
+    auto dimx_1 = inputx.numel() / inputx.dims()[0];
+    DLOG << " inputx : ";
+    for (int i = 0; i < inputx.dims()[0]; ++i) {
+        for (int j = 0; j < dimx_1; ++j) {
+            DLOGF("%f ", inputx_ptr[i * dimx_1 + j]);
+        }
+        DLOGF("\n");
+    }
+
+    auto dimy_1 = inputy.numel() / inputy.dims()[0];
+    DLOG << " inputy : ";
+    for (int i = 0; i < inputy.dims()[0]; ++i) {
+        for (int j = 0; j < dimy_1; ++j) {
+            // Row stride of y is dimy_1 (was dimx_1, which printed the wrong
+            // elements because x and y have different row lengths).
+            DLOGF("%f ", inputy_ptr[i * dimy_1 + j]);
+        }
+        DLOGF("\n");
+    }
+
+    auto dim_output_1 = output_mul->numel() / output_mul->dims()[0];
+    DLOG << " output : ";
+    for (int i = 0; i < output_mul->dims()[0]; ++i) {
+        for (int j = 0; j < dim_output_1; ++j) {
+            // Row stride of the output is its own row length, not y's.
+            DLOGF("%f ", output_mul_ptr[i * dim_output_1 + j]);
+        }
+        DLOGF("\n");
+    }

-            /// output (3,3)
-            DLOG << "output memory size : " << output_mul->memory_size();
-            DLOG << "output numel : " << output_mul->numel();
+    /// output (3,3)
+    DLOG << "output memory size : " << output_mul->memory_size();
+    DLOG << "output numel : " << output_mul->numel();

-            DLOG << inputx_ptr[0] << " x " << inputy_ptr[0] << " + "
-                 << inputx_ptr[1] << " x " << inputy_ptr[0 + 3] << " = "
-                 << output_mul_ptr[0];
-        }
-    } // namespace test
+    // Element (0,0) of the product written out by hand: x(0,0)*y(0,0) +
+    // x(0,1)*y(1,0), where y(1,0) is flat element 0 + 3.
+    DLOG << inputx_ptr[0] << " x " << inputy_ptr[0] << " + " << inputx_ptr[1]
+         << " x " << inputy_ptr[0 + 3] << " = " << output_mul_ptr[0];
+}
+} // namespace test
 } // namespace paddle_mobile
--- a/test/test_include.h
+++ b/test/test_include.h
@@ -7,7 +7,6 @@
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
-#include "framework/variable.h"
 #include "io.h"
 #include "test_helper.h"
 #include <map>

--- a/tools/pre-commit.hooks/.clang-format.hook
+++ b/tools/pre-commit.hooks/.clang-format.hook
+#!/bin/bash
+# pre-commit entry point for clang-format: every argument passed to this
+# script (options such as -i plus the file names) is forwarded to
+# clang-format. The version check below is currently disabled.
+#set -e
+#
+#readonly VERSION="3.8"
+#
+#version=$(clang-format -version)
+#
+#if ! [[ $version == *"$VERSION"* ]]; then
+#    echo "clang-format version check failed."
+#    echo "a version contains '$VERSION' is needed, but get '$version'"
+#    echo "you can install the right version, and make an soft-link to '\$PATH' env"
+#    exit -1
+#fi
+
+# "$@" must be quoted: unquoted, file names containing spaces or glob
+# characters would be split or expanded before reaching clang-format.
+clang-format "$@"
--- a/tools/pre-commit.hooks/.clang-tidy.hook
+++ b/tools/pre-commit.hooks/.clang-tidy.hook
 #!/bin/bash

-echo "allonli clang-tidy formating init"
-
 cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-clang-foramt -version
-clang-tidy -version
-python ./tools/pre-commit.hooks/run-clang-tidy.py
+python ./tools/pre-commit.hooks/run-clang-tidy.py -header-filter=third-party/
 #TOTAL_ERRORS=0

 # The trick to remove deleted files: https://stackoverflow.com/a/2413151