Merge remote-tracking branch 'gitlab/develop' into incubate/lite

8642fb8a · tensor-tang · 321e2a28 · 02029900 · 8642fb8a · 8642fb8a
36 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,10 @@ paddle/fluid/operators/distributed/send_recv.proto
 *.vs
 build/
 build_doc/
+build.*
 *.user
+*.sh
+*.bkp

 .vscode
 .idea

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    if(NOT DEFINED TARGET_ARCH_ABI)
        set(ARCH_ABI "arm64-v8a" CACHE STRING "Choose android platform")
    endif()
-
+    
    include(cross_compiling/host)
    include(cross_compiling/armlinux)
    include(cross_compiling/android)

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/op_desc.h"
-#include <glog/logging.h>
 #include <algorithm>
 #include <functional>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <utility>
+#include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"

--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -32,9 +32,9 @@ void Run(const char* model_dir) {
                  valid_places);

  auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<DDim::value_type>({100, 100})));
+  input_tensor->Resize(DDim(std::vector<DDim::value_type>({3, 224, 224})));
  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < 100 * 100; i++) {
+  for (int i = 0; i < 3 * 224 * 224; i++) {
    data[i] = i;
  }

@@ -65,6 +65,14 @@ USE_LITE_OP(feed);
 USE_LITE_OP(fetch);
 USE_LITE_OP(io_copy);

+USE_LITE_OP(conv2d);
+// USE_LITE_OP(batch_norm);
+USE_LITE_OP(relu);
+USE_LITE_OP(depthwise_conv2d);
+USE_LITE_OP(pool2d);
+USE_LITE_OP(elementwise_add);
+USE_LITE_OP(softmax);
+
 USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
 USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);

@@ -72,7 +80,15 @@ USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
 USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
+
+USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
+
 // USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
 // USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
 #endif  // LITE_WITH_ARM

--- a/paddle/fluid/lite/api/light_api.h
+++ b/paddle/fluid/lite/api/light_api.h
@@ -72,8 +72,9 @@ class LightPredictor {

    // Create the kernels of the target places, and filter out the specific
    // kernel with the target alias.
-    for (auto& op : program.ops()) {
-      auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
+    for (auto& op : program.ops_) {
+      lite::pb::OpDesc desc(op->op_info()->desc());
+      auto kernel_type = desc.GetAttr(kKernelTypeAttr).get<std::string>();
      std::string op_type, alias;
      Place place;
      KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
@@ -88,8 +89,8 @@ class LightPredictor {
      insts.emplace_back(op, std::move(*it));
    }
    program_.reset(new RuntimeProgram(std::move(insts)));
-    CHECK(program.exec_scope());
-    program_->set_exec_scope(program.exec_scope());
+    CHECK(program.exec_scope_);
+    program_->set_exec_scope(program.exec_scope_);
  }

 private:

--- a/paddle/fluid/lite/arm/math/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -6,4 +6,31 @@ if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
  return()
 endif()

-cc_library(math_arm SRCS funcs.cc packed_sgemm.cc softmax.cc scale.cc elementwise.cc DEPS ${lite_kernel_deps} eigen3)
+# TODO(xxx): separate them
+cc_library(math_arm SRCS  
+    funcs.cc 
+    packed_sgemm.cc 
+    softmax.cc 
+    scale.cc
+    pooling.cc
+    elementwise.cc
+    sgemv.cc
+    type_trans.cpp
+    conv_impl.cc
+    conv_direct_3x3s1.cc
+    conv_direct_3x3s2.cc
+    conv_direct.cc
+    conv_depthwise_3x3_int7.cc
+    conv_depthwise_3x3_int8.cc
+    conv_depthwise_5x5s1_int8.cc
+    conv_depthwise_3x3p0.cc
+    conv_depthwise_3x3p1.cc
+    conv_depthwise_5x5s1.cc
+    conv_depthwise_5x5s2.cc
+    conv_depthwise.cc
+    conv_gemmlike.cc
+    conv_winograd_3x3.cc
+    conv_winograd.cc
+    split.cc
+    DEPS ${lite_kernel_deps} eigen3)
+ 
--- a/paddle/fluid/lite/arm/math/pooling.cc
+++ b/paddle/fluid/lite/arm/math/pooling.cc
--- a/paddle/fluid/lite/arm/math/pooling.h
+++ b/paddle/fluid/lite/arm/math/pooling.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+// !pooling fp32 Op
+//
+// Every pooling entry point in this header is declared with one identical
+// parameter list. Only the declarations are visible here, so the notes below
+// are read off the parameter names and must be checked against the
+// definitions:
+//   din, dout          - untyped input / output buffers (presumably float
+//                        data, given the "fp32" note above -- TODO confirm)
+//   num                - presumably the batch count -- TODO confirm
+//   chout, hout, wout  - presumably output channels / height / width
+//   chin, hin, win     - presumably input channels / height / width
+//   ksize, strides, paddings - integer vectors with the window settings
+//   global_pooling, exclusive, adaptive, ceil_mode, use_quantizer
+//                      - boolean mode flags
+//   pooling_type       - string naming the pooling mode
+
+// By its name, the generic variant -- TODO confirm against the definition.
+void pooling_basic(const void* din, void* dout, int num, int chout, int hout,
+                   int wout, int chin, int hin, int win,
+                   const std::vector<int>& ksize,
+                   const std::vector<int>& strides,
+                   const std::vector<int>& paddings, bool global_pooling,
+                   bool exclusive, bool adaptive, bool ceil_mode,
+                   bool use_quantizer, const std::string& pooling_type);
+
+// By its name, the global-pooling variant -- TODO confirm.
+void pooling_global(const void* din, void* dout, int num, int chout, int hout,
+                    int wout, int chin, int hin, int win,
+                    const std::vector<int>& ksize,
+                    const std::vector<int>& strides,
+                    const std::vector<int>& paddings, bool global_pooling,
+                    bool exclusive, bool adaptive, bool ceil_mode,
+                    bool use_quantizer, const std::string& pooling_type);
+
+// By its name: 2x2 window, stride 2, max pooling -- TODO confirm.
+void pooling2x2s2_max(const void* din, void* dout, int num, int chout, int hout,
+                      int wout, int chin, int hin, int win,
+                      const std::vector<int>& ksize,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings, bool global_pooling,
+                      bool exclusive, bool adaptive, bool ceil_mode,
+                      bool use_quantizer, const std::string& pooling_type);
+
+// By its name: 2x2 window, stride 2, average pooling -- TODO confirm.
+void pooling2x2s2_ave(const void* din, void* dout, int num, int chout, int hout,
+                      int wout, int chin, int hin, int win,
+                      const std::vector<int>& ksize,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings, bool global_pooling,
+                      bool exclusive, bool adaptive, bool ceil_mode,
+                      bool use_quantizer, const std::string& pooling_type);
+
+// By its name: 3x3 window, stride 1, padding 1, max pooling -- TODO confirm.
+void pooling3x3s1p1_max(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+
+// By its name: 3x3 window, stride 1, padding 1, average pooling -- TODO
+// confirm.
+void pooling3x3s1p1_ave(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+
+// By its name: 3x3 window, stride 2, padding 1, max pooling -- TODO confirm.
+void pooling3x3s2p1_max(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+
+// By its name: 3x3 window, stride 2, padding 0, max pooling -- TODO confirm.
+void pooling3x3s2p0_max(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+
+// By its name: 3x3 window, stride 2, padding 1, average pooling -- TODO
+// confirm.
+void pooling3x3s2p1_ave(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+
+// By its name: 3x3 window, stride 2, padding 0, average pooling -- TODO
+// confirm.
+void pooling3x3s2p0_ave(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/split.cc
+++ b/paddle/fluid/lite/arm/math/split.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/arm/math/split.h"
+#include <algorithm>
+#include "paddle/fluid/lite/arm/math/funcs.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+// Copies `num` floats from `din` to `dout`. Whole groups of 16 floats are
+// moved with four 4-lane NEON loads followed by four stores (the groups are
+// distributed by the OpenMP pragma); the leftover floats are copied one by one.
+template <>
+void split_cpy<float>(const float* din, float* dout, int num) {
+  const int block_cnt = num >> 4;
+  const int tail = num % 16;
+#pragma omp parallel for
+  for (int blk = 0; blk < block_cnt; blk++) {
+    const float* src = din + (blk << 4);
+    float* dst = dout + (blk << 4);
+
+    // Load the full 16-float group first, then write it back out.
+    const float32x4_t lane0 = vld1q_f32(src);
+    const float32x4_t lane1 = vld1q_f32(src + 4);
+    const float32x4_t lane2 = vld1q_f32(src + 8);
+    const float32x4_t lane3 = vld1q_f32(src + 12);
+
+    vst1q_f32(dst, lane0);
+    vst1q_f32(dst + 4, lane1);
+    vst1q_f32(dst + 8, lane2);
+    vst1q_f32(dst + 12, lane3);
+  }
+  if (tail > 0) {
+    // Scalar copy of whatever did not fill a 16-float group.
+    const float* src_tail = din + (block_cnt << 4);
+    float* dst_tail = dout + (block_cnt << 4);
+    for (int k = 0; k < tail; k++) {
+      dst_tail[k] = src_tail[k];
+    }
+  }
+}
+
+// Scatters the float buffer `din` into the tensors listed in `dout`.
+//
+// For each output tensor a table of suffix products of its dims is built
+// (entry i = dims[i] * dims[i+1] * ... * dims[last]). The entry at `axis` is
+// the length of one contiguous run copied per step, and entry 0 divided by it
+// is the number of runs. Consecutive runs are read `in_strides[axis]` floats
+// apart from the input, and the read offset advances by one run length after
+// every output tensor.
+template <>
+void split<float>(const float* din, std::vector<lite::Tensor*>* dout,
+                  const int axis, const std::vector<int>& in_strides) {
+  int src_offset = 0;
+  for (lite::Tensor* tensor : *dout) {
+    auto dims = tensor->dims();
+    std::vector<int> suffix_prod(dims.size());
+    const auto last = dims.size() - 1;
+    suffix_prod[last] = dims[last];
+    for (int d = dims.size() - 2; d >= 0; --d) {
+      suffix_prod[d] = suffix_prod[d + 1] * dims[d];
+    }
+
+    float* dst = tensor->mutable_data<float>();
+    const int run_cnt = suffix_prod[0] / suffix_prod[axis];
+    const int src_step = in_strides[axis];
+    const int run_len = suffix_prod[axis];
+
+    for (int run = 0; run < run_cnt; ++run) {
+      split_cpy(din + src_offset + run * src_step, dst + run * run_len,
+                run_len);
+    }
+    src_offset += run_len;
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/split.h
+++ b/paddle/fluid/lite/arm/math/split.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+// Element copy helper used by split(). Only declared here; the float
+// specialization in split.cc copies `num` values from `din` to `dout`.
+template <typename T>
+void split_cpy(const T* din, T* dout, int num);
+
+// Distributes the input buffer `din` over the tensors in `dout` along `axis`.
+// `in_strides` is indexed by `axis` in the float specialization in split.cc to
+// get the distance between consecutive chunks of the input.
+// NOTE(review): the expected contents of `in_strides` are defined by the
+// caller, which is not visible here -- confirm.
+template <typename T>
+void split(const T* din, std::vector<lite::Tensor*>* dout, const int axis,
+           const std::vector<int>& in_strides);
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/type_trans.cpp
+++ b/paddle/fluid/lite/arm/math/type_trans.cpp
--- a/paddle/fluid/lite/core/memory.h
+++ b/paddle/fluid/lite/core/memory.h
@@ -65,6 +65,8 @@ class Buffer {
    TargetCopy(target_, data_, other.data_, nbytes);
  }

+  ~Buffer() { Free(); }  // Releases the buffer via Free() on destruction.
+
 private:
  // memory it actually malloced.
  size_t space_{0};

--- a/paddle/fluid/lite/core/mir/pattern_matcher_tester.cc
+++ b/paddle/fluid/lite/core/mir/pattern_matcher_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/core/mir/pattern_matcher.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+// Populates `g` with five statement nodes (op1..op5) followed by four
+// argument nodes (var1..var4), wired as:
+//   op1 -> var1 -> op2
+//   op2 -> var2 -> op3
+//   op2 -> var2 -> op4
+//   op2 -> var3 -> op5
+//   op3 -> var4 -> op5
+void BuildGraph(SSAGraph* g) {
+  // Appends a default-constructed node and returns a reference to it.
+  auto append_node = [g]() -> Node& {
+    g->mutable_nodes().emplace_back();
+    return g->mutable_nodes().back();
+  };
+  // Appends a statement node carrying the given op type.
+  auto add_op = [&append_node](const char* type) -> Node& {
+    Node& node = append_node();
+    node.AsStmt().op_type = type;
+    return node;
+  };
+  // Appends an argument node with the given variable name.
+  auto add_var = [&append_node](const char* name) -> Node& {
+    Node& node = append_node();
+    node.AsArg(name);
+    return node;
+  };
+  // Records the edge from -> to on both endpoints.
+  auto connect = [](Node* from, Node* to) {
+    from->outlinks.push_back(to);
+    to->inlinks.push_back(from);
+  };
+
+  Node& op1 = add_op("op1");
+  Node& op2 = add_op("op2");
+  Node& op3 = add_op("op3");
+  Node& op4 = add_op("op4");
+  Node& op5 = add_op("op5");
+  Node& var1 = add_var("var1");
+  Node& var2 = add_var("var2");
+  Node& var3 = add_var("var3");
+  Node& var4 = add_var("var4");
+
+  // op1 -> var1 -> op2
+  connect(&op1, &var1);
+  connect(&var1, &op2);
+  // op2 -> var2 -> op3
+  // op2 -> var2 -> op4
+  connect(&op2, &var2);
+  connect(&var2, &op3);
+  connect(&var2, &op4);
+  // op2 -> var3 -> op5
+  connect(&op2, &var3);
+  connect(&var3, &op5);
+  // op3 -> var4 -> op5
+  connect(&op3, &var4);
+  connect(&var4, &op5);
+}
+
+// NewNode() must hand back a non-null node and record it in the pattern.
+TEST(PMPattern, NewNode) {
+  PMPattern pattern;
+  auto* created = pattern.NewNode([](const Node* node) { return true; });
+  ASSERT_TRUE(created);
+  ASSERT_EQ(pattern.nodes_.size(), 1UL);
+}
+
+// AddEdge() must store the (source, destination) pair; the outcome is checked
+// through the raw members first and through the accessors afterwards.
+TEST(PMPattern, AddEdge) {
+  PMPattern pattern;
+  auto* src = pattern.NewNode([](const Node* node) { return true; });
+  auto* dst = pattern.NewNode([](const Node* node) { return true; });
+  ASSERT_TRUE(src);
+  ASSERT_TRUE(dst);
+  pattern.AddEdge(src, dst);
+
+  // Raw member view.
+  ASSERT_EQ(pattern.nodes_.size(), 2UL);
+  ASSERT_EQ(pattern.edges_.size(), 1UL);
+  ASSERT_EQ(pattern.edges_.front().first, src);
+  ASSERT_EQ(pattern.edges_.front().second, dst);
+
+  // Accessor view.
+  ASSERT_EQ(pattern.nodes().size(), 2UL);
+  ASSERT_EQ(pattern.edges().size(), 1UL);
+  ASSERT_EQ(pattern.edges().front().first, src);
+  ASSERT_EQ(pattern.edges().front().second, dst);
+}
+
+// Builds the pattern op2 -> var2 -> op3, runs MarkPMNodesInGraph() on the
+// graph produced by BuildGraph() and expects three pattern nodes to be mapped
+// and exactly one subgraph to be detected.
+TEST(PatternMatcher, MarkPMNodesInGraph) {
+  PatternMatcher x;
+  // mark o2, o3, v2
+
+  // The pattern is a graph:
+  //   o2(a node named o2) -> v2(a node named v2)
+  //   v2 -> o3(a node named o3)
+  auto* o2 = x.pattern_.NewNode([](const Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->IsStmt() && node->stmt()->op_type == "op2";
+  });
+  auto* o3 = x.pattern_.NewNode([](const Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->IsStmt() && node->stmt()->op_type == "op3";
+  });
+  auto* v2 = x.pattern_.NewNode([](const Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->IsArg() && node->arg()->name == "var2";
+  });
+
+  // Each teller checks `node` for null first, so a null node is rejected.
+  ASSERT_FALSE(o2->Tell(nullptr));
+  ASSERT_FALSE(o3->Tell(nullptr));
+  ASSERT_FALSE(v2->Tell(nullptr));
+
+  x.pattern_.AddEdge(o2, v2);
+  x.pattern_.AddEdge(v2, o3);
+
+  // Edges are expected in insertion order.
+  ASSERT_EQ(x.pattern_.edges().size(), 2UL);
+  ASSERT_EQ(x.pattern_.edges()[0].first, o2);
+  ASSERT_EQ(x.pattern_.edges()[0].second, v2);
+  ASSERT_EQ(x.pattern_.edges()[1].first, v2);
+  ASSERT_EQ(x.pattern_.edges()[1].second, o3);
+
+  SSAGraph graph;
+  BuildGraph(&graph);
+
+  x.MarkPMNodesInGraph(&graph);
+
+  // One map entry per pattern node (o2, o3, v2).
+  ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL);
+
+  auto subgraphs = x.DetectPatterns();
+  ASSERT_EQ(subgraphs.size(), 1UL);
+}
+
+// Matches the pattern (op2 or op3) -> any variable -> any statement against
+// the graph from BuildGraph() and counts how often the handler is invoked.
+// The assertions only require the handler to run once or twice.
+TEST(PatternMatcher, MultiSubgraph) {
+  SSAGraph graph;
+  BuildGraph(&graph);
+
+  PatternMatcher x;
+
+  // The pattern is a graph:
+  //   op -> var
+  auto* any_op = x.mutable_pattern()->NewNode(
+      [](const Node* node) {
+        return node->IsStmt() && (node->stmt()->op_type == "op2" ||
+                                  node->stmt()->op_type == "op3");
+      },
+      "OP0");
+  // The variable in the middle is marked as intermediate.
+  auto* any_var =
+      x.mutable_pattern()
+          ->NewNode([](const Node* node) { return node->IsArg(); }, "VAR")
+          ->AsIntermediate();
+  auto* any_op1 = x.mutable_pattern()->NewNode(
+      [](const Node* node) { return node->IsStmt(); }, "OP1");
+
+  x.mutable_pattern()->AddEdge(any_op, any_var);
+  x.mutable_pattern()->AddEdge(any_var, any_op1);
+
+  // The handler logs every detected subgraph and bumps the counter.
+  int count = 0;
+  PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s,
+                                        SSAGraph* g) {
+    LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> "
+              << s.at(any_var)->arg()->name << " -> "
+              << s.at(any_op1)->stmt()->op_type;
+    count++;
+  };
+
+  x(&graph, handle);
+
+  // 1. Detect op3 -> var4 -> op5
+  // 2. Detect op2 -> var2 -> op3
+  // 3. Detect op2 -> var2 -> op4
+  // 4. Detect op2 -> var3 -> op5
+  // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
+  ASSERT_GE(count, 1);
+  ASSERT_LE(count, 2);
+}
+
+// Matches op2 -> var2 -> op3 twice on the graph from BuildGraph(): first with
+// var2 marked as intermediate (no match expected), then with var2 marked as
+// input (exactly one match expected).
+TEST(PatternMatcher, IntermediateCheck) {
+  SSAGraph graph;
+  BuildGraph(&graph);
+
+  // o2->v2->o3
+  // o2->v2->o4
+  // check o2+o3 fuse, should fail because v2 also link to o4.
+  PatternMatcher matcher;
+  auto* op2 = matcher.mutable_pattern()->NewNode(
+      [](const Node* x) {
+        return x && x->IsStmt() && x->stmt()->op_type == "op2";
+      },
+      "op2");
+  auto* op3 = matcher.mutable_pattern()->NewNode(
+      [](const Node* x) {
+        return x && x->IsStmt() && x->stmt()->op_type == "op3";
+      },
+      "op3");
+  auto* v2 = matcher.mutable_pattern()
+                 ->NewNode(
+                     [](const Node* x) {
+                       return x && x->IsArg() && x->arg()->name == "var2";
+                     },
+                     "var2")
+                 ->AsIntermediate();
+  // Wire the pattern as op2 -> var2 -> op3.
+  v2->LinksFrom({op2}).LinksTo({op3});
+
+  // First pass: var2 is intermediate, so no subgraph may be reported.
+  int count = 0;
+  matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) {
+    ++count;
+  });
+  EXPECT_EQ(count, 0);
+
+  // Second pass: with var2 re-marked as an input, one subgraph is expected.
+  count = 0;
+  v2->AsInput();
+  matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) {
+    ++count;
+  });
+  ASSERT_EQ(count, 1);
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/core/op_registry.h
+++ b/paddle/fluid/lite/core/op_registry.h
@@ -91,9 +91,9 @@ class KernelRegistry final {
  void Register(const std::string &name,
                typename KernelRegistryForTarget<Target, Precision,
                                                 Layout>::creator_t &&creator) {
-    // VLOG(3) << "register for " << TargetToStr(Target) << ":"
-    //<< PrecisionToStr(Precision) << "//"
-    //<< GetKernelOffset<Target, Precision, Layout>();
+    VLOG(3) << "register for " << TargetToStr(Target) << ":"
+            << PrecisionToStr(Precision) << "//"
+            << GetKernelOffset<Target, Precision, Layout>();
    using kernel_registor_t =
        KernelRegistryForTarget<Target, Precision, Layout>;
    auto &varient = registries_[GetKernelOffset<Target, Precision, Layout>()];
@@ -153,6 +153,9 @@ class KernelRegistor : public lite::Registor<KernelType> {
 public:
  KernelRegistor(const std::string &op_type, const std::string &alias)
      : Registor<KernelType>([=] {
+          VLOG(3) << "Register kernel " << op_type << " for "
+                  << TargetToStr(target) << " " << PrecisionToStr(precision)
+                  << " " << DataLayoutToStr(layout) << " alias " << alias;
          KernelRegistry::Global().Register<target, precision, layout>(
              op_type, [=]() -> std::unique_ptr<KernelType> {
                std::unique_ptr<KernelType> x(new KernelType);

--- a/paddle/fluid/lite/core/tensor.h
+++ b/paddle/fluid/lite/core/tensor.h
@@ -21,6 +21,7 @@
 * looks the same.
 */

+#include <string>
 #include <vector>
 #include "paddle/fluid/lite/core/target_wrapper.h"


--- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -9,12 +9,18 @@ cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
 cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
 cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
+cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
+cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
+cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)

 lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
 lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
+lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
 lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
+lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
+lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)

 set(arm_kernels
    fc_compute_arm
@@ -22,6 +28,11 @@ set(arm_kernels
    mul_compute_arm
    scale_compute_arm
    softmax_compute_arm
-	elementwise_add_compute_arm)
+    conv_compute_arm
+    elementwise_add_compute_arm
+    pool_compute_arm
+    split_compute_arm
+    )

 set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
+ 
--- a/paddle/fluid/lite/kernels/arm/conv_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/conv_compute.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+// Picks the convolution implementation that matches the current parameters
+// (depthwise, winograd, direct or gemm-like), stores it in impl_ and calls
+// its create() with the parameters and the ARM context. Aborts via CHECK when
+// create() reports failure.
+void ConvCompute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+  auto x_dims = param.x->dims();
+  auto w_dims = param.filter->dims();
+  auto o_dims = param.output->dims();
+
+  auto& ctx = this->ctx_->template As<ARMContext>();
+
+  // NOTE(review): win, hin, bs and the four data pointers below are not
+  // referenced again in this function. They are left in place in case the
+  // accessors matter (e.g. mutable_data() allocating the output) -- confirm
+  // before removing them.
+  int win = x_dims[3];  // nchw
+  int hin = x_dims[2];
+  int ic = x_dims[1];
+  int bs = x_dims[0];
+  int ow = o_dims[3];
+  int oh = o_dims[2];
+  int oc = o_dims[1];
+  int kh = w_dims[2];  // oihw
+  int kw = w_dims[3];
+  int pad = param.paddings[0];
+  int stride = param.strides[0];
+
+  const auto* i_data = param.x->data<float>();
+  const auto* w_data = param.filter->data<float>();
+  const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
+  auto* o_data = param.output->mutable_data<float>();
+
+  // Square kernel with identical padding and stride in both directions.
+  bool kps_equal = (param.paddings[0] == param.paddings[1]) &&
+                   (param.strides[0] == param.strides[1]) && (kw == kh);
+  bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1);
+  // Kernel / padding / stride combinations accepted for the depthwise path.
+  bool flag_dw_3x3 =
+      (kw == 3 && (pad == 0 || pad == 1) && (stride == 1 || stride == 2));
+  bool flag_dw_5x5 =
+      (kw == 5 && stride == 1) || (kw == 5 && stride == 2 && pad == 2);
+  bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
+
+  // impl_ is overwritten below; release whatever an earlier call created so
+  // that preparing the kernel again does not leak the old object.
+  delete impl_;
+  impl_ = nullptr;
+
+  // select conv impl
+  if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) {
+    // dw conv impl
+    impl_ = new lite::arm::math::DepthwiseConv<PRECISION(kFloat)>;
+    VLOG(3) << "invoking dw conv";
+  } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
+             no_dilation) {
+    if (ic >= 32 && oc >= 32 && oh > 16 && ow > 16) {
+      // winograd conv impl
+      impl_ = new lite::arm::math::WinogradConv<PRECISION(kFloat)>;
+      VLOG(3) << "invoking winograd conv";
+    } else {
+      // direct conv impl
+      impl_ = new lite::arm::math::DirectConv<PRECISION(kFloat)>;
+      VLOG(3) << "invoking direct conv";
+    }
+  } else if (param.groups == 1 && kw == 3 && stride == 2 && kps_equal &&
+             no_dilation) {
+    // direct conv impl
+    impl_ = new lite::arm::math::DirectConv<PRECISION(kFloat)>;
+    VLOG(3) << "invoking direct conv";
+  } else {
+    // Everything else falls back to the gemm-like implementation.
+    impl_ = new lite::arm::math::GemmLikeConv<PRECISION(kFloat)>;
+    VLOG(3) << "invoking gemm like conv";
+  }
+  CHECK(this->impl_->create(param, &ctx));
+}
+
+// Runs the implementation stored in impl_ on the current parameters; aborts
+// via CHECK when impl_ is still null.
+void ConvCompute::Run() {
+  auto& conv_param = this->Param<param_t>();
+  CHECK(impl_);
+  impl_->run(conv_param);
+  // Activation handling is currently disabled:
+  // if (this->act_ != nullptr) {
+  //   this->act_->run(outputs, outputs, param.activation_param);
+  // }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+// Registers ConvCompute under the alias `def` as the conv2d kernel for
+// kARM / kFloat / kNCHW. Input, Bias and Filter are bound as ARM tensors and
+// Out as an ARM tensor output.
+REGISTER_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW,
+                     paddle::lite::kernels::arm::ConvCompute, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
+
+// Same kernel class and bindings, registered a second time under the
+// depthwise_conv2d op name.
+REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW,
+                     paddle::lite::kernels::arm::ConvCompute, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/arm/conv_compute.h
+++ b/paddle/fluid/lite/kernels/arm/conv_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/lite/arm/math/funcs.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/operators/conv_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+// Convolution kernel for kARM / kFloat. It holds a raw pointer to an
+// implementation object (impl_) and deletes that object in its destructor.
+class ConvCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ConvParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  // Deleting a null pointer is a no-op, so no explicit null check is needed.
+  ~ConvCompute() { delete impl_; }
+
+ private:
+  using impl_t =
+      lite::arm::math::ImplBase<TARGET(kARM), PRECISION(kFloat), param_t>;
+
+  // Null until an implementation has been assigned.
+  impl_t* impl_{nullptr};
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/conv_compute.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+/// Naive direct convolution that serves as the reference for the ARM kernel.
+///
+/// Reads `param.x`, `param.filter` and, when it is not null, `param.bias`,
+/// and writes every element of `param.output`, whose dims must already be
+/// set. The index arithmetic treats input and output as densely packed
+/// [batch, channel, height, width] buffers and the filter as
+/// [out_channel, in_channel / groups, kernel_h, kernel_w].
+///
+/// Each product is formed in `dtype`; the running sum is kept in the float
+/// output buffer. Kernel taps that fall outside the input are skipped, i.e.
+/// they contribute zero. ReLU is currently hard-wired off (see TODO).
+template <typename dtype>
+void conv_compute_ref(const operators::ConvParam& param) {
+  DDim input_dims = param.x->dims();
+  DDim filter_dims = param.filter->dims();
+  DDim output_dims = param.output->dims();
+  const std::vector<int>& paddings = param.paddings;
+  const std::vector<int>& strides = param.strides;
+  const std::vector<int>& dilations = param.dilations;
+  const int groups = param.groups;
+
+  // The inputs are only read, so all of them go through the read-only
+  // accessor; only the output buffer is requested as mutable.
+  const float* input_data = param.x->data<float>();
+  const float* filter_data = param.filter->data<float>();
+  const float* bias_data =
+      param.bias != nullptr ? param.bias->data<float>() : nullptr;
+  float* output_data = param.output->mutable_data<float>();
+  const bool flag_bias = bias_data != nullptr;
+  const bool flag_relu = false;  // TODO(hong19860320) param.relu
+
+  const int num = input_dims[0];
+  const int chout = output_dims[1];
+  const int hout = output_dims[2];
+  const int wout = output_dims[3];
+
+  const int chin = input_dims[1];
+  const int hin = input_dims[2];
+  const int win = input_dims[3];
+  const int out_c_group = chout / groups;
+  const int in_c_group = chin / groups;
+
+  const int stride_h = strides[0];
+  const int stride_w = strides[1];
+  const int dilation_h = dilations[0];
+  const int dilation_w = dilations[1];
+  const int padding_h = paddings[0];
+  const int padding_w = paddings[1];
+  const int kernel_h = filter_dims[2];
+  const int kernel_w = filter_dims[3];
+
+  for (int n = 0; n < num; ++n) {
+    for (int g = 0; g < groups; ++g) {
+      for (int oc = 0; oc < out_c_group; ++oc) {
+        for (int oh = 0; oh < hout; ++oh) {
+          for (int ow = 0; ow < wout; ++ow) {
+            const int out_idx = n * groups * out_c_group * hout * wout +
+                                g * out_c_group * hout * wout +
+                                oc * hout * wout + oh * wout + ow;
+            float& out = output_data[out_idx];
+            // Start from the bias of this output channel, or from zero.
+            out = flag_bias ? bias_data[g * out_c_group + oc] : 0.f;
+            for (int ic = 0; ic < in_c_group; ++ic) {
+              for (int kh = 0; kh < kernel_h; ++kh) {
+                const int ih = oh * stride_h - padding_h + kh * dilation_h;
+                // The whole kernel row lies in the padding area.
+                if (ih < 0 || ih >= hin) continue;
+                for (int kw = 0; kw < kernel_w; ++kw) {
+                  const int iw = ow * stride_w - padding_w + kw * dilation_w;
+                  if (iw < 0 || iw >= win) continue;
+
+                  const int iidx = n * chin * hin * win +
+                                   g * in_c_group * hin * win +
+                                   ic * hin * win + ih * win + iw;
+                  const int widx =
+                      g * out_c_group * in_c_group * kernel_h * kernel_w +
+                      oc * in_c_group * kernel_h * kernel_w +
+                      ic * kernel_h * kernel_w + kh * kernel_w + kw;
+
+                  out += static_cast<dtype>(input_data[iidx]) *
+                         static_cast<dtype>(filter_data[widx]);
+                }
+              }
+            }
+            if (flag_relu) {
+              out = out > 0.f ? out : 0.f;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Asks the global kernel registry for "conv2d" kernels on the ARM target with
+// float precision and requires a non-empty result whose first entry is set.
+TEST(conv_arm, retrive_op) {
+  auto kernels =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
+          "conv2d");
+  ASSERT_FALSE(kernels.empty());
+  ASSERT_TRUE(kernels.front());
+}
+
+// A default-constructed kernel must report float precision and the ARM
+// target.
+TEST(conv_arm, init) {
+  ConvCompute kernel;
+  ASSERT_EQ(kernel.precision(), PRECISION(kFloat));
+  ASSERT_EQ(kernel.target(), TARGET(kARM));
+}
+
+// Sweeps batch size, channel counts, spatial size, bias, depthwise mode,
+// dilation, stride, padding and kernel size, runs ConvCompute for each
+// combination and compares every output element against conv_compute_ref.
+TEST(conv_arm, compute) {
+  DeviceInfo::Init();
+
+  // Runs one configuration: builds the tensors, launches the ARM kernel and
+  // checks the result against the reference implementation.
+  auto run_case = [](int n, int ic, int oc, int ih, int iw, bool flag_bias,
+                     int group, int dilation, int stride, int padding,
+                     int ks) {
+    // get input, filter and output shape
+    std::vector<int64_t> input_shape = {n, ic, ih, iw};
+    std::vector<int64_t> filter_shape = {oc, ic / group, ks, ks};
+    const int dks = dilation * (ks - 1) + 1;  // extent of the dilated kernel
+    const int oh = (ih + 2 * padding - dks) / stride + 1;
+    const int ow = (iw + 2 * padding - dks) / stride + 1;
+    std::vector<int64_t> output_shape({n, oc, oh, ow});
+    // resize input, filter and output
+    Tensor input;
+    Tensor filter;
+    Tensor bias;
+    Tensor output;
+    Tensor output_ref;
+    input.Resize(input_shape);
+    filter.Resize(filter_shape);
+    output.Resize(output_shape);
+    output_ref.Resize(output_shape);
+    VLOG(3) << "input: " << input.dims();
+    VLOG(3) << "filter: " << filter.dims() << " padding:" << padding
+            << " stride:" << stride << " dilation:" << dilation;
+    VLOG(3) << "output: " << output.dims();
+    auto* input_data = input.mutable_data<float>();
+    auto* filter_data = filter.mutable_data<float>();
+    auto* output_data = output.mutable_data<float>();
+    for (int i = 0; i < input.dims().production(); i++) {
+      input_data[i] = static_cast<float>(i % 128);
+    }
+    for (int i = 0; i < filter.dims().production(); i++) {
+      filter_data[i] =
+          i * 0.001f / static_cast<float>(filter.dims().production());
+    }
+    // prepare kernel params and run
+    ConvCompute conv;
+    std::unique_ptr<KernelContext> ctx(new KernelContext);
+    ctx->As<ARMContext>();
+    conv.SetContext(std::move(ctx));
+    operators::ConvParam param;
+    param.x = &input;
+    param.filter = &filter;
+    param.output = &output;
+    param.bias = nullptr;
+    if (flag_bias) {
+      bias.Resize({oc});
+      auto* bias_data = bias.mutable_data<float>();
+      for (int i = 0; i < bias.dims().production(); i++) {
+        bias_data[i] = static_cast<float>(i);
+      }
+      param.bias = &bias;
+    }
+    // TODO(hong19860320) param.relu = flag_relu;
+    param.paddings = std::vector<int>({padding, padding});
+    param.strides = std::vector<int>({stride, stride});
+    param.dilations = std::vector<int>({dilation, dilation});
+    param.groups = group;
+    conv.SetParam(param);
+    conv.Launch();
+    // invoking ref implementation and compare results
+    param.output = &output_ref;
+    conv_compute_ref<float>(param);
+    auto* output_ref_data = output_ref.mutable_data<float>();
+    for (int i = 0; i < output.dims().production(); i++) {
+      EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-3);
+    }
+  };
+
+  for (auto n : {1, 2}) {
+    for (auto ic : {6, 32 /*, 128*/}) {
+      for (auto oc : {6, 32 /*, 128*/}) {
+        for (auto ih : {9, 18 /*, 56 , 112, 224, 512*/}) {
+          for (auto iw : {9, 18 /*, 56, 112, 224, 512*/}) {
+            for (auto flag_bias : {false, true}) {
+              for (auto flag_relu : {false, true}) {
+                // Not forwarded yet, see the TODO in run_case.
+                (void)flag_relu;
+                for (auto depthwise : {false, true}) {
+                  for (auto dilation : {1, 2}) {
+                    for (auto stride : {1, 2}) {
+                      for (auto padding : {0, 1, 2}) {
+                        for (auto ks : {1, 3, 5}) {
+                          // Depthwise convolution uses one group per input
+                          // channel and as many outputs as inputs. These
+                          // stay in per-iteration locals: overwriting the
+                          // loop variable `oc` would carry the depthwise
+                          // value into the following non-depthwise cases.
+                          const int group = depthwise ? ic : 1;
+                          const int out_ch = depthwise ? ic : oc;
+                          run_case(n, ic, out_ch, ih, iw, flag_bias, group,
+                                   dilation, stride, padding, ks);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
--- a/paddle/fluid/lite/kernels/arm/pool_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc
--- a/paddle/fluid/lite/kernels/arm/pool_compute.h
+++ b/paddle/fluid/lite/kernels/arm/pool_compute.h
--- a/paddle/fluid/lite/kernels/arm/pool_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc
--- a/paddle/fluid/lite/kernels/arm/split_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/split_compute.cc
--- a/paddle/fluid/lite/kernels/arm/split_compute.h
+++ b/paddle/fluid/lite/kernels/arm/split_compute.h
--- a/paddle/fluid/lite/kernels/arm/split_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/split_compute_test.cc
--- a/paddle/fluid/lite/kernels/arm/use_kernels.h
+++ b/paddle/fluid/lite/kernels/arm/use_kernels.h
@@ -19,5 +19,6 @@ USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
 USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
--- a/paddle/fluid/lite/operators/CMakeLists.txt
+++ b/paddle/fluid/lite/operators/CMakeLists.txt
--- a/paddle/fluid/lite/operators/conv_op.cc
+++ b/paddle/fluid/lite/operators/conv_op.cc
--- a/paddle/fluid/lite/operators/conv_op.h
+++ b/paddle/fluid/lite/operators/conv_op.h
--- a/paddle/fluid/lite/operators/op_params.h
+++ b/paddle/fluid/lite/operators/op_params.h
--- a/paddle/fluid/lite/operators/pool_op.cc
+++ b/paddle/fluid/lite/operators/pool_op.cc
--- a/paddle/fluid/lite/operators/pool_op.h
+++ b/paddle/fluid/lite/operators/pool_op.h
--- a/paddle/fluid/lite/operators/pool_op_test.cc
+++ b/paddle/fluid/lite/operators/pool_op_test.cc
--- a/paddle/fluid/lite/operators/split_op.cc
+++ b/paddle/fluid/lite/operators/split_op.cc
--- a/paddle/fluid/lite/operators/split_op.h
+++ b/paddle/fluid/lite/operators/split_op.h
--- a/paddle/fluid/lite/utils/any.h
+++ b/paddle/fluid/lite/utils/any.h