make io_copy kernel pick works

621d1522 · superjomn · 1fb93746 · 621d1522 · 621d1522 · 621d1522
51 changed file
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -42,6 +42,7 @@ class OpDesc {
  void CopyFrom(const OpDesc &op_desc);

  proto::OpDesc *Proto();
+  const proto::OpDesc &ReadonlyProto() const { return desc_; }

  std::string Type() const { return desc_.type(); }


--- a/paddle/fluid/lite/api/CMakeLists.txt
+++ b/paddle/fluid/lite/api/CMakeLists.txt
-cc_library(cxx_api_lite SRCS cxx_api.cc DEPS scope_lite op_executor_lite host_kernels ops_lite optimizer_lite target_wrapper_host)
-
+cc_library(cxx_api_lite SRCS cxx_api.cc DEPS scope_lite op_executor_lite host_kernels ops_lite optimizer_lite target_wrapper_host )
 cc_test(test_cxx_api_lite SRCS cxx_api_test.cc DEPS cxx_api_lite model_parser_lite)
+
+if(LITE_WITH_CUDA)
+    cc_library(cxx_api_lite_cuda SRCS cxx_api.cc DEPS scope_lite op_executor_lite host_kernels ops_lite optimizer_lite target_wrapper_host target_wrapper_cuda kernels_cuda)
+    nv_test(test_cxx_api_lite_cuda SRCS cxx_api_test.cc DEPS cxx_api_lite_cuda model_parser_lite)
+endif()
--- a/paddle/fluid/lite/api/cxx_api.h
+++ b/paddle/fluid/lite/api/cxx_api.h
@@ -29,7 +29,7 @@ class Predictor {
 public:
  Predictor() { scope_ = std::make_shared<Scope>(); }

-  void Build(const std::string& model_path,
+  void Build(const std::string& model_path, const Place& prefer_place,
             const std::vector<Place>& valid_places) {
    framework::proto::ProgramDesc prog;
    LoadModel(model_path, scope_.get(), &prog);
@@ -38,6 +38,7 @@ class Predictor {
    Program program(prog_desc, scope_, valid_places);

    Optimizer optimizer;
+    optimizer.KernelPickPreferPlace(prefer_place);
    core::KernelPickFactor factor;
    factor.ConsiderTarget();
    optimizer.Run(std::move(program), valid_places, factor);

--- a/paddle/fluid/lite/api/cxx_api_test.cc
+++ b/paddle/fluid/lite/api/cxx_api_test.cc
@@ -23,8 +23,21 @@ namespace lite {

 TEST(CXXApi, test) {
  lite::Predictor predictor;
+#ifndef LITE_WITH_CUDA
+  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)}});
+#else
+  std::vector<Place> valid_places({
+      Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)},
+      Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)},
+      Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)},
+      Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)},
+  });
+#endif
+
  predictor.Build("/home/chunwei/project2/models/model2",
-                  {Place{TARGET(kHost), PRECISION(kFloat)}});
+                  Place{TARGET(kCUDA), PRECISION(kFloat)}, valid_places);

  auto* input_tensor = predictor.GetInput(0);
  input_tensor->Resize({100, 100});
@@ -54,8 +67,15 @@ USE_LITE_OP(fc);
 USE_LITE_OP(scale);
 USE_LITE_OP(feed);
 USE_LITE_OP(fetch);
-USE_LITE_KERNEL(fc, kHost, kFloat, def);
-USE_LITE_KERNEL(mul, kHost, kFloat, def);
-USE_LITE_KERNEL(scale, kHost, kFloat, def);
-USE_LITE_KERNEL(feed, kHost, kFloat, def);
-USE_LITE_KERNEL(fetch, kHost, kFloat, def);
+USE_LITE_OP(io_copy);
+USE_LITE_KERNEL(fc, kHost, kFloat, kNCHW, def);
+USE_LITE_KERNEL(mul, kHost, kFloat, kNCHW, def);
+USE_LITE_KERNEL(scale, kHost, kFloat, kNCHW, def);
+USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
+USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
+
+#ifdef LITE_WITH_CUDA
+USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
+USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
+USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host);
+#endif
--- a/paddle/fluid/lite/core/CMakeLists.txt
+++ b/paddle/fluid/lite/core/CMakeLists.txt
@@ -27,5 +27,6 @@ cc_test(test_tensor_lite SRCS tensor_test.cc)
 cc_test(test_op_executor_lite SRCS op_executor_test.cc DEPS op_executor_lite ops_lite host_kernels)
 cc_test(test_type_system SRCS type_system_test.cc DEPS type_system)
 cc_test(test_optimizer_lite SRCS optimizer_test.cc DEPS mir_pass_manager program_fake_utils mir_passes)
+cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)

 add_subdirectory(mir)
--- a/paddle/fluid/lite/core/kernel.cc
+++ b/paddle/fluid/lite/core/kernel.cc
@@ -17,6 +17,13 @@
 namespace paddle {
 namespace lite {

+std::string KernelBase::summary() const {
+  std::stringstream ss;
+  ss << op_type() << ":" << TargetToStr(target()) << "/"
+     << PrecisionToStr(precision()) << "/" << DataLayoutToStr(layout());
+  return ss.str();
+}
+
 bool ParamTypeRegistry::KeyCmp::operator()(
    const ParamTypeRegistry::key_t &a,
    const ParamTypeRegistry::key_t &b) const {

--- a/paddle/fluid/lite/core/kernel.h
+++ b/paddle/fluid/lite/core/kernel.h
@@ -17,6 +17,7 @@
 #include <map>
 #include <memory>
 #include <set>
+#include <sstream>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
@@ -34,49 +35,100 @@ namespace lite {
 // different targets.
 class KernelBase {
 public:
+  // type_infer_handler is used to inference a output type by considering the
+  // input types in the type system.
+  using type_infer_handler_t = std::function<const Type*(
+      const std::map<std::string, const Type*>& input_types,
+      const std::string& out_arg)>;
+
  virtual void Run() = 0;

  void SetContext(std::unique_ptr<KernelContext>&& ctx) {
    context_ = std::move(ctx);
  }
-
  template <typename T>
  void SetParam(T param) {
    param_.set<T>(param);
  }
-
  template <typename P>
  P& Param() const {
    return param_.get<P>();
  }

+  // This is used in the kernels that takes 'kAny' places and inference the
+  // output place. For `ScaleCompute` and `IoCopyCompute`, their input types are
+  // declared as 'kAny' in some Place field, and the output is also `kAny`, but
+  // when in real execution, when takes some non-kAny type as input, the
+  // output's kAny-fields can be determained. For example, when the
+  // `ScaleCompute` takes `TensorFp32NCHWTy` as input, its output should be also
+  // `TensorFp32NCHWTy`. This type inference rule is different for each kernel,
+  // so we make it a virtual method.
+  // One can custom this handler to make a specific type inference rule for a
+  // kernel, or leave the default to force the kernel use the system's
+  // type-inference rules.
+  virtual std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() {
+    return nullptr;
+  }
+
  void set_op_type(const std::string& type) { op_type_ = type; }
  const std::string& op_type() const { return op_type_; }

  void Torch() {}

+  // Get input declaration type.
+  const Type* GetInputDeclType(const std::string& arg_name) {
+    CHECK(!op_type_.empty()) << "op_type should be set first";
+    const auto* type = ParamTypeRegistry::Global().RetrieveInArgument(
+        place(), GenParamTypeKey(), arg_name);
+    CHECK(type) << "no type registered for kernel [" << op_type_
+                << "] input argument [" << arg_name << "]"
+                << " with key " << GenParamTypeKey();
+    return type->type;
+  }
+
+  // Get output declaration type.
+  const Type* GetOutputDeclType(const std::string& arg_name) {
+    CHECK(!op_type_.empty()) << "op_type should be set first";
+    const auto* type = ParamTypeRegistry::Global().RetrieveOutArgument(
+        place(), GenParamTypeKey(), arg_name);
+    CHECK(type) << "no type registered for kernel [" << op_type_
+                << "] output argument [" << arg_name << "]";
+    return type->type;
+  }
+
+  void set_alias(const std::string& x) {
+    alias_ = x;
+    LOG(INFO) << "kernel " << op_type() << " setting alias " << alias();
+  }
+  const std::string& alias() const { return alias_; }
+
  virtual Place place() const = 0;
  virtual TargetType target() const = 0;
  virtual PrecisionType precision() const = 0;
  virtual DataLayoutType layout() const = 0;
  const KernelContext* context() const { return context_.get(); }
-
  virtual std::string name() const = 0;

-  virtual ~KernelBase() = default;
+  // Short human-readable document.
+  std::string summary() const;
+  // Long human-readable document.
+  virtual std::string doc() const { return ""; }

-  std::string DebugString() const {
+  std::string GenParamTypeKey() const {
    std::stringstream ss;
-    ss << op_type() << ":" << TargetToStr(target()) << "/"
-       << PrecisionToStr(precision()) << "/" << DataLayoutToStr(layout());
+    LOG(INFO) << "alias : " << alias_;
+    ss << op_type() << "/" << alias_;
    return ss.str();
  }

+  virtual ~KernelBase() = default;
+
 protected:
  std::unique_ptr<KernelContext> context_;
  mutable operators::param_t param_;
  // The corresponding op type.
-  std::string op_type_;
+  std::string op_type_{};
+  std::string alias_{};
 };

 // Light-weight kernel implementation.

--- a/paddle/fluid/lite/core/memory.cc
+++ b/paddle/fluid/lite/core/memory.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "memory.h"
+#include "paddle/fluid/lite/core/memory.h"

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/lite/core/memory.h
+++ b/paddle/fluid/lite/core/memory.h
@@ -14,7 +14,7 @@

 #pragma once
 #include <glog/logging.h>
-#include "target_wrapper.h"
+#include "paddle/fluid/lite/core/target_wrapper.h"

 namespace paddle {
 namespace lite {
@@ -26,9 +26,12 @@ static void* TargetMalloc(TargetType target, size_t size) {
    case TargetType::kX86:
      data = TargetWrapper<TARGET(kHost)>::Malloc(size);
      break;
+#ifdef LITE_WITH_CUDA
    case TargetType::kCUDA:
-      data = TargetWrapper<TARGET(kCUDA)>::Malloc(size);
+      data =
+          TargetWrapper<TARGET(kCUDA), cudaStream_t, cudaEvent_t>::Malloc(size);
      break;
+#endif  // LITE_WITH_CUDA
    default:
      LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
  }

--- a/paddle/fluid/lite/core/mir/CMakeLists.txt
+++ b/paddle/fluid/lite/core/mir/CMakeLists.txt
@@ -7,9 +7,12 @@ cc_library(mir_passes
        SRCS static_kernel_pick_pass.cc
        variable_place_inference_pass.cc
        io_complement_pass.cc
+        io_copy_kernel_pick_pass.cc
        graph_visualize_pass.cc
        generate_program_pass.cc
+        argument_type_display_pass.cc
        demo_pass.cc
+        runtime_context_assign_pass.cc
        DEPS mir_pass types_lite)

 cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes)

--- a/paddle/fluid/lite/core/mir/argument_type_display_pass.cc
+++ b/paddle/fluid/lite/core/mir/argument_type_display_pass.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/core/mir/pass.h"
+#include "paddle/fluid/lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class ArgumentTypeDisplayPass : public DebugPass {
+ public:
+  void Apply(std::unique_ptr<mir::SSAGraph>& graph) override {
+    LOG(INFO) << "== Argument types ==";
+    for (auto& node : graph->mutable_nodes()) {
+      if (!node.IsArgument()) continue;
+
+      auto* type = node.AsArgument().type;
+      if (type) {
+        LOG(INFO) << "* ARG " << node.AsArgument().name << " type: " << *type;
+      } else {
+        LOG(INFO) << "* ARG " << node.AsArgument().name << " type: UNK";
+      }
+    }
+    LOG(INFO) << "---------------------";
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(argument_type_display_pass,
+                  paddle::lite::mir::ArgumentTypeDisplayPass);
--- a/paddle/fluid/lite/core/mir/generate_program_pass.cc
+++ b/paddle/fluid/lite/core/mir/generate_program_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/mir/generate_program_pass.h"
+#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
 #include "paddle/fluid/lite/core/mir/pass_registry.h"

 namespace paddle {
@@ -20,9 +21,11 @@ namespace lite {
 namespace mir {

 void GenerateProgramPass::Apply(std::unique_ptr<mir::SSAGraph>& graph) {
+  LOG(INFO) << "final program \n" << Visualize(graph.get());
  for (auto& item : graph->InstructTopologicalOrder()) {
    if (item->IsInstruct()) {
      auto& instruct = item->AsInstruct();
+      LOG(INFO) << instruct;
      insts_.emplace_back(instruct.op,
                          std::move(instruct.valid_kernels.front()));
    }

--- a/paddle/fluid/lite/core/mir/io_complement_pass.cc
+++ b/paddle/fluid/lite/core/mir/io_complement_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/mir/io_complement_pass.h"
+#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
 #include "paddle/fluid/lite/core/mir/pass_registry.h"

 namespace paddle {
@@ -21,28 +22,161 @@ namespace mir {

 void IoComplementPass::Apply(std::unique_ptr<mir::SSAGraph>& graph) {
  // Start from inputs of the graph, those should have place set.
+  std::list<Node*> nodes;
  for (auto& node : graph->mutable_nodes()) {
-    if (!node.IsInstruct()) continue;
-    auto& inst = node.AsInstruct();
-
-    // inputs
-    for (auto* in : node.inlinks) {
-      CHECK(in->IsArgument());
-      auto name = in->AsArgument().name;
-      std::string tmp;
-      CHECK(inst.op_info->GetInputArgname(name, &tmp));
-      auto type =
-          ParamTypeRegistry::Global().Retrieve<ParamTypeRegistry::IO::kInput>(
-              inst.place, inst.op_type, tmp);
-      CHECK(type) << "no param type found for " << inst.op_type << ":" << name
-                  << " " << inst.place;
-      CHECK(type->type);
-      CHECK(in->AsArgument().type);
-      if (!TypeCompatible(*type->type, *in->AsArgument().type)) {
-        LOG(INFO) << "found IO unmatched tensor: " << in->AsArgument().name;
+    nodes.push_back(&node);
+  }
+
+  CHECK(!valid_places_.empty());
+
+  for (auto& node : nodes) {
+    if (!node->IsInstruct()) continue;
+    auto inlinks = node->inlinks;
+    for (auto* in : inlinks) {
+      ComplementInputs(graph.get(), node, in);
+    }
+  }
+
+  // PickIoCopyKernel(graph.get());
+
+  LOG(INFO) << "\n" << Visualize(graph.get());
+}
+
+void IoComplementPass::ComplementInputs(SSAGraph* graph, Node* inst_node,
+                                        Node* in) {
+  // If this input is out of date.
+  if (inst_node->inlinks.end() ==
+      std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in))
+    return;
+
+  CHECK(inst_node->IsInstruct());
+  auto& inst = inst_node->AsInstruct();
+  CHECK(in->IsRoleSet());
+  CHECK(in->IsArgument());
+  auto in_arg_name = in->AsArgument().name;
+  std::string tmp;
+  CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
+  auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
+  CHECK(in->AsArgument().type);
+  if (!TypeCompatibleTo(*in->AsArgument().type, *decl_arg_type)) {
+    LOG(INFO) << "found IO unmatched tensor: " << in->AsArgument().name
+              << " for kernel " << inst.op->DebugString() << " "
+              << *in->AsArgument().type << " -> " << *decl_arg_type;
+    // Add an IoCopy instruction to make the input compatible with other dist.
+    AddIoCopyInst(*in->AsArgument().type, *decl_arg_type, in->AsArgument().name,
+                  graph, inst_node, valid_places_);
+  }
+}
+
+void UpdateOpdescInputName(framework::OpDesc* desc,
+                           const std::string& old_arg_name,
+                           const std::string& new_arg_name) {
+  for (auto& item : *desc->Proto()->mutable_inputs()) {
+    for (int i = 0; i < item.mutable_arguments()->size(); i++) {
+      auto* arg = item.mutable_arguments(i);
+      if (*arg == old_arg_name) {
+        *arg = new_arg_name;
+      }
+    }
+  }
+}
+
+void IoComplementPass::AddIoCopyInst(const Type& from, const Type& to,
+                                     const std::string& var, SSAGraph* graph,
+                                     Node* inst_node,
+                                     const std::vector<Place>& valid_places) {
+  CHECK(!valid_places.empty()) << "valid_place should be set";
+  // var -> new_transform_op -> new_var -> inst
+  // So there will be a new Argument node and a new IoCopy Instruct Node.
+
+  auto node_id = [&] { return graph->nodes().size(); };
+  auto io_copy_output_name = var + "/trans/" + std::to_string(node_id());
+  auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
+  auto* io_copy_inst = graph->NewInstructNode();
+
+  // create Op and kernels.
+  auto io_copy_op = LiteOpRegistry::Global().Create("io_copy");
+  // CHECK(io_copy_op);
+  // Create the new var manually.
+  inst_node->AsInstruct().op->scope()->Var(io_copy_output_name);
+
+  // Create IoCopy Instruction.
+  framework::OpDesc op_desc;
+  op_desc.SetType("io_copy");
+  op_desc.SetInput("Input", {var});
+  op_desc.SetOutput("Out", {io_copy_output_name});
+  op_desc.Flush();
+
+  io_copy_op->Attach(op_desc, inst_node->AsInstruct().op->scope());
+  auto kernels = io_copy_op->CreateKernels(valid_places);
+  io_copy_inst->AsInstruct("io_copy", std::move(kernels), io_copy_op);
+
+  // Remove the old link
+  RemoveDirectedLink(graph->Argument(var), inst_node);
+
+  // Update the original instruction OpDesc.
+  // Update its input to the io_copy_output_name
+  auto& inst = inst_node->AsInstruct();
+  auto inst_program_desc = inst.op_info()->desc();
+
+  // Add new link, var -> new_inst, new_inst->newarg, newarg->inst
+  DirectedLink(graph->Argument(var), io_copy_inst);
+  DirectedLink(io_copy_inst, io_copy_output_arg);
+  DirectedLink(io_copy_output_arg, inst_node);
+
+  // reset opdesc and update kernel information
+  auto desc_dummy = inst_node->AsInstruct().op->op_info()->desc();
+  UpdateInputTo(&desc_dummy, var, io_copy_output_name);
+
+  framework::OpDesc desc_fake(desc_dummy, nullptr);
+  inst_node->AsInstruct().op->Attach(desc_fake,
+                                     inst_node->AsInstruct().op->scope());
+
+  std::string tmp;
+  if (inst_node->AsInstruct().op_info()->GetInputArgname("a", &tmp)) {
+    CHECK(false) << "get old a " << tmp;
+  }
+
+  for (auto& kernel : inst_node->AsInstruct().valid_kernels) {
+    inst_node->AsInstruct().op->AttachKernel(kernel.get());
+  }
+
+  graph->CheckValid();
+}
+
+void IoComplementPass::PickIoCopyKernel(SSAGraph* graph) {
+  for (auto& node : graph->mutable_nodes()) {
+    if (node.IsInstruct() && node.AsInstruct().op_type == "io_copy") {
+      auto& kernels = node.AsInstruct().valid_kernels;
+      CHECK(!kernels.empty()) << "No valid kernels found for IoCopy Op";
+      for (auto& kernel : kernels) {
+        CHECK_EQ(node.inlinks.size(), 1UL);
+        CHECK_EQ(node.outlinks.size(), 1UL);
+        auto* inty = node.inlinks.front()->AsArgument().type;
+        auto* outy = node.outlinks.front()->AsArgument().type;
+        const Type* in_arg_ty = kernel->GetInputDeclType("Input");
+        if (TypeCompatibleTo(*inty, *in_arg_ty)) {
+          const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
+          // Both the input and output type matches, remove other kernels
+          // directly.
+          if (out_arg_ty->target() == outy->target()) {
+            LOG(INFO) << "get a IOCopy kernel";
+            auto x = std::move(kernel);
+            kernels.clear();
+            kernels.emplace_back(std::move(x));
+            break;
+          }
+        }
      }
    }
  }
+
+  // Check the compatiblity.
+}
+
+void IoComplementPass::SetValidPlaces(const std::vector<Place>& valid_places) {
+  CHECK(!valid_places.empty());
+  valid_places_ = valid_places;
 }

 }  // namespace mir

--- a/paddle/fluid/lite/core/mir/io_complement_pass.h
+++ b/paddle/fluid/lite/core/mir/io_complement_pass.h
@@ -15,18 +15,47 @@
 #pragma once

 #include "paddle/fluid/lite/core/mir/pass.h"
+#include "paddle/fluid/lite/core/op_registry.h"

 namespace paddle {
 namespace lite {
 namespace mir {

+static void UpdateInputTo(framework::proto::OpDesc* desc,
+                          const std::string& from, const std::string& to) {
+  for (auto& item : *desc->mutable_inputs()) {
+    for (auto& input : *item.mutable_arguments()) {
+      if (input == from) {
+        LOG(INFO) << "** update input argument from " << from << " to " << to;
+        input = to;
+      }
+    }
+  }
+}
+
 /*
 * IoComplementPass complement the necessary instruction to make data
 * transferring or transformation between different places.
 */
 class IoComplementPass : public ProgramPass {
 public:
-  void Apply(std::unique_ptr<mir::SSAGraph> &graph) override;
+  void Apply(std::unique_ptr<mir::SSAGraph>& graph) override;
+
+  void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in);
+
+  void AddIoCopyInst(const Type& from, const Type& to, const std::string& var,
+                     SSAGraph* graph, Node* inst_node,
+                     const std::vector<Place>& valid_places);
+
+  void SetValidPlaces(const std::vector<Place>& valid_places);
+
+  // Pick the right kernel of IoCopy considering the input and output Type.
+  void PickIoCopyKernel(SSAGraph* graph);
+
+  const std::vector<Place>& valid_places() const { return valid_places_; };
+
+ private:
+  std::vector<Place> valid_places_;
 };

 }  // namespace mir

--- a/paddle/fluid/lite/core/mir/io_copy_kernel_pick_pass.cc
+++ b/paddle/fluid/lite/core/mir/io_copy_kernel_pick_pass.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/core/mir/pass.h"
+#include "paddle/fluid/lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class IoCopyKernelPickPass : public InstructionPass {
+ public:
+  void Apply(std::unique_ptr<mir::SSAGraph>& graph) override {
+    for (auto& node : graph->mutable_nodes()) {
+      if (!node.IsInstruct()) continue;
+      auto& inst = node.AsInstruct();
+      if (inst.op_type != "io_copy") continue;
+
+      LOG(INFO) << "....> picking a IO COPY kernel";
+
+      auto& kernels = node.AsInstruct().valid_kernels;
+      CHECK(!kernels.empty()) << "No valid kernels found for IoCopy Op";
+      const auto* inty = node.inlinks.front()->AsArgument().type;
+      const auto* outy = node.outlinks.front()->AsArgument().type;
+      LOG(INFO) << "input type " << *inty;
+      LOG(INFO) << "output type " << *outy;
+
+      bool is_found = false;
+      LOG(INFO) << "kernels size " << kernels.size();
+      for (auto& kernel : kernels) {
+        CHECK_EQ(node.inlinks.size(), 1UL);
+        CHECK_EQ(node.outlinks.size(), 1UL);
+
+        const Type* in_arg_ty = kernel->GetInputDeclType("Input");
+        const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
+        LOG(INFO) << "checking kernel candidate " << *in_arg_ty << "->"
+                  << *out_arg_ty;
+        if (inty->target() == in_arg_ty->target()) {
+          // Both the input and output type matches, remove other kernels
+          // directly.
+          if (out_arg_ty->target() == outy->target()) {
+            LOG(INFO) << "get a IOCopy kernel";
+            auto x = std::move(kernel);
+            kernels.clear();
+            kernels.emplace_back(std::move(x));
+            is_found = true;
+            break;
+          }
+        }
+      }
+
+      CHECK(is_found) << "Can't find a IoCopy kernel for IO: " << *inty << "->"
+                      << *outy;
+    }
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(io_copy_kernel_pick_pass,
+                  paddle::lite::mir::IoCopyKernelPickPass);
--- a/paddle/fluid/lite/core/mir/node.h
+++ b/paddle/fluid/lite/core/mir/node.h
@@ -34,30 +34,43 @@ class Node {
  Node() = default;

  enum class Role {
-    kUnk = -1,
-    kArgument,
+    kArgument = 0,
    kInstruct,
-    kNumRoles /*should be last*/
+    kNumRoles, /*should be last*/
+    kUnk,
  };

  struct Instruct {
    std::string op_type;
-    Place place;
    // The kernel instances this Instruct contains.
    std::vector<std::unique_ptr<KernelBase>> valid_kernels;
-    std::shared_ptr<OpInfo> op_info;
    // TODO(Superjomn) make this a shared_ptr for resource safety.
    std::shared_ptr<OpLite> op;  // we hold op to run InferShape

+    const OpInfo* op_info() {
+      CHECK(op);
+      return op->op_info();
+    }
+
+    Place place() const {
+      CHECK(!valid_kernels.empty());
+      return valid_kernels.front()->place();
+    }
+
    KernelBase& picked_kernel() {
      CHECK(!valid_kernels.empty());
      return *valid_kernels.front();
    }
+
+    friend std::ostream& operator<<(std::ostream& os, const Instruct& other) {
+      os << "Instruct " << other.op_type << " " << other.place();
+      return os;
+    }
  };

  struct Argument {
    std::string name;
-    const Type* type;
+    const Type* type{};
    // Weight is a special kind of argument, it is marked as weight explicitly
    // so that some weight related optimization can take place.
    bool is_weight{false};
@@ -71,13 +84,11 @@ class Node {

  Instruct& AsInstruct(const std::string& op_type,
                       std::vector<std::unique_ptr<KernelBase>>&& kernels,
-                       const std::shared_ptr<OpLite>& op,
-                       const std::shared_ptr<lite::OpInfo>& op_info) {
+                       const std::shared_ptr<OpLite>& op) {
    auto& x = AsInstruct();
    x.op_type = op_type;
    x.op = op;
    x.valid_kernels = std::move(kernels);
-    x.op_info = op_info;
    return x;
  }

@@ -100,8 +111,25 @@ class Node {
    instruct_.reset(new Instruct);
    return *instruct_;
  }
+
+  friend std::ostream& operator<<(std::ostream& os, Node& other) {
+    os << static_cast<int>(other.role_) << " ";
+    if (!other.IsRoleSet()) {
+      os << "Unk role node";
+    }
+    if (other.IsArgument()) {
+      auto& arg = other.AsArgument();
+      os << "Argument " << arg.name;
+    }
+    if (other.IsInstruct()) {
+      auto& arg = other.AsInstruct();
+      os << "Instruct " << arg.op_type;
+    }
+    return os;
+  }
+
  // Check roles.
-  bool IsRoleSet() const { return role_ == Role::kUnk; }
+  bool IsRoleSet() const { return role_ != Role::kUnk; }
  bool IsInstruct() const { return role_ == Role::kInstruct; }
  bool IsArgument() const { return role_ == Role::kArgument; }


--- a/paddle/fluid/lite/core/mir/passes.h
+++ b/paddle/fluid/lite/core/mir/passes.h
@@ -26,3 +26,5 @@ USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
 USE_MIR_PASS(io_complement_pass);
 USE_MIR_PASS(generate_program_pass);
+USE_MIR_PASS(io_copy_kernel_pick_pass);
+USE_MIR_PASS(argument_type_display_pass);
--- a/paddle/fluid/lite/core/mir/ssa_graph.cc
+++ b/paddle/fluid/lite/core/mir/ssa_graph.cc
@@ -89,6 +89,144 @@ std::vector<mir::Node *> SSAGraph::InstructTopologicalOrder() {
  return res;
 }

+void SSAGraph::GraphCreateTmpVarNodes(const Program &program) {
+  for (const auto &name : program.tmp_vars) {
+    LOG(INFO) << "create arg node " << name;
+    node_storage_.emplace_back();
+    auto &new_node = node_storage_.back();
+    new_node.AsArgument(name);
+    arguments_[name] = &new_node;
+  }
+}
+
+void SSAGraph::GraphCreateWeightVarNodes(const Program &program) {
+  // create weight nodes.
+  for (const auto &name : program.weights) {
+    LOG(INFO) << "create arg node " << name;
+    node_storage_.emplace_back();
+    auto &new_node = node_storage_.back();
+    new_node.AsArgument(name);
+    arguments_[name] = &new_node;
+  }
+}
+
+Node *SSAGraph::GraphCreateInstructNode(
+    const Program &program, const std::shared_ptr<OpLite> &op,
+    const std::vector<Place> &valid_places) {
+  node_storage_.emplace_back();
+  // TODO(Superjomn) remove one valid_places here.
+  op->SetValidPlaces(valid_places);
+  auto &new_node = node_storage_.back();
+  auto kernels = op->CreateKernels(valid_places);
+  node_storage_.back().AsInstruct(op->op_type_, std::move(kernels), op);
+
+  CHECK(new_node.inlinks.empty()) << "duplicate Build found";
+  CHECK(new_node.outlinks.empty()) << "duplicate Build found";
+  return &node_storage_.back();
+}
+
+void SSAGraph::Build(const Program &program,
+                     const std::vector<Place> &valid_places) {
+  CHECK(node_storage_.empty());
+  GraphCreateTmpVarNodes(program);
+  GraphCreateWeightVarNodes(program);
+  CHECK(CheckNodesRoleSet());
+
+  for (auto &op : program.ops) {
+    auto *op_node = GraphCreateInstructNode(program, op, valid_places);
+    LOG(INFO) << "checking op " << op->op_type_;
+    for (const std::string &name : op->op_info()->input_names()) {
+      auto *arg = Argument(name);
+      LOG(INFO) << "input " << name;
+      CHECK(arg->IsRoleSet());
+      DirectedLink(arg, op_node);
+    }
+    for (const std::string &name : op->op_info()->output_names()) {
+      if (!arguments_.count(name)) {
+        NewArgumentNode(name);
+      }
+      LOG(INFO) << "output " << name;
+      auto *arg = arguments_.at(name);
+      CHECK(arg->IsRoleSet());
+      DirectedLink(op_node, arg);
+    }
+    CHECK(CheckLinksRoleSet());
+  }
+
+  MarkArgumentWeights(program);
+  CheckValid();
+}
+
+mir::Node *SSAGraph::Argument(const std::string &name) {
+  auto it = arguments_.find(name);
+  CHECK(it != arguments_.end()) << "no argument called " << name;
+  return it->second;
+}
+
+std::vector<mir::Node *> SSAGraph::inputs() {
+  std::vector<mir::Node *> res;
+  for (auto &node : node_storage_) {
+    if (node.inlinks.empty()) {
+      res.push_back(&node);
+    }
+  }
+  return res;
+}
+
+std::vector<mir::Node *> SSAGraph::outputs() {
+  std::vector<mir::Node *> res;
+  for (auto &node : node_storage_) {
+    if (node.outlinks.empty()) {
+      res.push_back(&node);
+    }
+  }
+  return res;
+}
+
+mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) {
+  auto it = arguments_.find(arg);
+  if (it != arguments_.end()) {
+    return it->second;
+  }
+  return nullptr;
+}
+
+bool SSAGraph::CheckNodesRoleSet() {
+  for (auto &node : mutable_nodes()) {
+    CHECK_OR_FALSE(node.IsRoleSet());
+  }
+  return true;
+}
+
+bool SSAGraph::CheckLinksRoleSet() {
+  for (auto &node : mutable_nodes()) {
+    CHECK_OR_FALSE(node.IsRoleSet());
+    if (!node.IsInstruct()) continue;
+    for (auto *x : node.inlinks) {
+      CHECK_OR_FALSE(x->IsRoleSet());
+      CHECK_OR_FALSE(x->IsArgument());
+    }
+    for (auto *x : node.outlinks) {
+      CHECK_OR_FALSE(x->IsRoleSet());
+      CHECK_OR_FALSE(x->IsArgument());
+    }
+  }
+  return true;
+}
+
+Node *SSAGraph::NewArgumentNode(const std::string &name) {
+  node_storage_.emplace_back();
+  CHECK(!arguments_.count(name)) << "duplicate argument called " << name;
+  arguments_[name] = &node_storage_.back();
+  node_storage_.back().AsArgument(name);
+  return &node_storage_.back();
+}
+
+Node *SSAGraph::NewInstructNode() {
+  node_storage_.emplace_back();
+  return &node_storage_.back();
+}
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
--- a/paddle/fluid/lite/core/mir/ssa_graph.h
+++ b/paddle/fluid/lite/core/mir/ssa_graph.h
@@ -35,104 +35,44 @@ class SSAGraph : GraphBase {
 public:
  // @param program: the op program
  // @param valid_places: the valid places user set for the system.
-  void Build(const Program &program, const std::vector<Place> &valid_places) {
-    // create temporary nodes.
-    for (const auto &name : program.tmp_vars) {
-      node_storage_.emplace_back();
-      auto &new_node = node_storage_.back();
-      auto &arg = new_node.AsArgument();
-      arg.name = name;
-      arguments_[name] = &new_node;
-    }
-
-    // create weight nodes.
-    for (const auto &name : program.weights) {
-      node_storage_.emplace_back();
-      auto &new_node = node_storage_.back();
-      auto &arg = new_node.AsArgument();
-      arg.name = name;
-      arguments_[name] = &new_node;
-    }
+  void Build(const Program &program, const std::vector<Place> &valid_places);

-    for (auto &op : program.ops) {
-      node_storage_.emplace_back();
-      // TODO(Superjomn) remove one valid_places here.
-      op->SetValidPlaces(valid_places);
-      auto &new_node = node_storage_.back();
-      auto kernels = op->CreateKernels(valid_places);
-      node_storage_.back().AsInstruct(op->op_type_, std::move(kernels), op,
-                                      op->op_info());
-
-      CHECK(new_node.inlinks.empty()) << "duplicate Build found";
-      CHECK(new_node.outlinks.empty()) << "duplicate Build found";
-
-      // collect inputs and outputs
-      for (const std::string &name : op->op_info()->input_names()) {
-        auto *arg = Argument(name);
-        new_node.inlinks.push_back(arg);
-        arg->outlinks.push_back(&new_node);
-      }
-      for (const std::string &name : op->op_info()->output_names()) {
-        if (!arguments_.count(name)) {
-          node_storage_.emplace_back();
-          auto &new_node = node_storage_.back();
-          auto &arg = new_node.AsArgument(name);
-          arg.name = name;
-          arguments_.emplace(name, &new_node);
-        }
-        auto *arg = arguments_.at(name);
-        new_node.outlinks.push_back(arg);
-        arg->inlinks.push_back(&new_node);
-      }
-    }
-
-    MarkArgumentWeights(program);
-  }
-
-  mir::Node *Argument(const std::string &name) {
-    auto it = arguments_.find(name);
-    CHECK(it != arguments_.end()) << "no argument called " << name;
-    return it->second;
-  }
+  mir::Node *Argument(const std::string &name);

  std::vector<mir::Node *> InstructTopologicalOrder();

  // The inputs of the graph.
-  std::vector<mir::Node *> inputs() {
-    std::vector<mir::Node *> res;
-    for (auto &node : node_storage_) {
-      if (node.inlinks.empty()) {
-        res.push_back(&node);
-      }
-    }
-    return res;
-  }
+  std::vector<mir::Node *> inputs();

  // The outputs of the graph.
-  std::vector<mir::Node *> outputs() {
-    std::vector<mir::Node *> res;
-    for (auto &node : node_storage_) {
-      if (node.outlinks.empty()) {
-        res.push_back(&node);
-      }
-    }
-    return res;
-  }
+  std::vector<mir::Node *> outputs();

  const std::list<mir::Node> &nodes() const { return node_storage_; }
  std::list<mir::Node> &mutable_nodes() { return node_storage_; }

-  mir::Node *RetrieveArgument(const std::string &arg) {
-    auto it = arguments_.find(arg);
-    if (it != arguments_.end()) {
-      return it->second;
-    }
-    return nullptr;
+  mir::Node *RetrieveArgument(const std::string &arg);
+
+  Node *NewArgumentNode(const std::string &name);
+  Node *NewInstructNode();
+
+  void CheckValid() {
+    CHECK(CheckBidirectionalConnection());
+    CHECK(CheckNodesRoleSet());
+    CHECK(CheckLinksRoleSet());
  }

 private:
+  void GraphCreateTmpVarNodes(const Program &program);
+  void GraphCreateWeightVarNodes(const Program &program);
+  Node *GraphCreateInstructNode(const Program &program,
+                                const std::shared_ptr<OpLite> &op,
+                                const std::vector<Place> &valid_places);
+
  // Check the bidirectional connection.
  bool CheckBidirectionalConnection();
+  bool CheckNodesRoleSet();
+  // Check all the items's role in inlinks and outlinks is set.
+  bool CheckLinksRoleSet();

  void MarkArgumentWeights(const Program &program) {
    for (const auto &name : program.weights) {
@@ -152,6 +92,48 @@ class SSAGraph : GraphBase {
  std::map<std::string, mir::Node *> arguments_;
 };

+// Remove the link between a -> b.
+static void RemoveDirectedLink(Node *a, Node *b) {
+  auto it = std::find(b->inlinks.begin(), b->inlinks.end(), a);
+  if (it != b->inlinks.end()) {
+    b->inlinks.erase(it);
+  }
+
+  auto it1 = std::find(a->outlinks.begin(), a->outlinks.end(), b);
+  if (it1 != a->outlinks.end()) {
+    a->outlinks.erase((it1));
+  }
+}
+
+// Link a -> b.
+static void DirectedLink(Node *a, Node *b) {
+  // Eagerly remove first, to avoid duplicate link.
+  RemoveDirectedLink(a, b);
+  a->outlinks.push_back(b);
+  b->inlinks.push_back(a);
+}
+
+static void LocalInferenceType(Node *a, Node *b, const std::string &arg_name) {
+  // instr -> output argument
+  if (a->IsInstruct() && b->IsArgument()) {
+    auto &inst = a->AsInstruct();
+    auto &output = b->AsArgument();
+
+    if (!output.type) {
+      output.type = inst.picked_kernel().GetOutputDeclType(arg_name);
+    }
+  }
+
+  // input argument -> instr
+  if (a->IsArgument() && b->IsInstruct()) {
+    auto &input = a->AsArgument();
+    auto &inst = b->AsInstruct();
+    if (!input.type) {
+      input.type = inst.picked_kernel().GetInputDeclType(arg_name);
+    }
+  }
+}
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
--- a/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc
+++ b/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc
@@ -37,7 +37,9 @@ void StaticKernelPickPass::Apply(std::unique_ptr<mir::SSAGraph>& graph) {
    auto& instruct = node.AsInstruct();
    std::vector<std::pair<size_t, std::unique_ptr<KernelBase>>> scored;
    for (auto&& kernel : instruct.valid_kernels) {
-      scored.emplace_back(KernelGrade(*kernel), std::move(kernel));
+      size_t score = KernelGrade(*kernel);
+      LOG(INFO) << "kernel " << kernel->summary() << " " << score;
+      scored.emplace_back(score, std::move(kernel));
    }

    std::sort(scored.begin(), scored.end(), KernelScoreCmp);
@@ -47,7 +49,6 @@ void StaticKernelPickPass::Apply(std::unique_ptr<mir::SSAGraph>& graph) {
    // TODO(Superjomn) reconsider this.
    instruct.valid_kernels.clear();
    instruct.valid_kernels.emplace_back(std::move(scored.front().second));
-    instruct.place = instruct.valid_kernels.front()->place();
    LOG(INFO) << "pick " << instruct.valid_kernels.front()->name();
  }
 }

--- a/paddle/fluid/lite/core/mir/static_kernel_pick_pass.h
+++ b/paddle/fluid/lite/core/mir/static_kernel_pick_pass.h
@@ -37,6 +37,7 @@ class StaticKernelPickPass : public mir::InstructionPass {
 public:
  void Apply(std::unique_ptr<mir::SSAGraph>& graph) override;

+  void SetPreferPlace(const Place& place) { place_ = place; }
  const Place& place() const { return place_; }
  const core::KernelPickFactor& kernel_pick_factors() const {
    return kernel_pick_factors_;
@@ -51,16 +52,32 @@ class StaticKernelPickPass : public mir::InstructionPass {
    size_t score{};
    const int kMax =
        std::numeric_limits<core::KernelPickFactor::value_type>::max();
+
+    // The more important factor comes first
    if (kernel_pick_factors_.IsTargetConsidered() &&
-        place().target == kernel.target()) {
+        (place().target == kernel.target() || kernel.target() == TARGET(kAny) ||
+         place().target == TARGET(kAny))) {
      score +=
          kMax / static_cast<int>(core::KernelPickFactor::Factor::TargetFirst);
    }
    if (kernel_pick_factors_.IsPrecisionConsidered() &&
-        place().precision == kernel.precision()) {
+        (place().precision == kernel.precision() ||
+         kernel.precision() == PRECISION(kAny) ||
+         place().precision == PRECISION(kAny))) {
      score += kMax /
               static_cast<int>(core::KernelPickFactor::Factor::PrecisionFirst);
    }
+    if (kernel_pick_factors_.IsDataLayoutConsidered() &&
+        (place().layout == kernel.layout() ||
+         kernel.layout() == DATALAYOUT(kAny) ||
+         place().layout == DATALAYOUT(kAny))) {
+      score += kMax / static_cast<int>(
+                          core::KernelPickFactor::Factor::DataLayoutFirst);
+    }
+    LOG(INFO) << "picker tactic " << kernel_pick_factors_;
+    LOG(INFO) << "kernel place " << kernel.place();
+    LOG(INFO) << "picker place " << place();
+    LOG(INFO) << "score " << score;

    // The data layout is not considered, for the input and output arguments
    // might have different data layout.

--- a/paddle/fluid/lite/core/mir/variable_place_inference_pass.cc
+++ b/paddle/fluid/lite/core/mir/variable_place_inference_pass.cc
@@ -22,8 +22,8 @@ namespace mir {

 void VariablePlaceInferencePass::Apply(std::unique_ptr<mir::SSAGraph>& graph) {
  MarkInputPlace(graph.get());
-
  InferenceArgumentPlace(graph.get());
+  CheckAllArgumentTypeDetermined(graph.get());
 }

 }  // namespace mir

--- a/paddle/fluid/lite/core/mir/variable_place_inference_pass.h
+++ b/paddle/fluid/lite/core/mir/variable_place_inference_pass.h
@@ -31,6 +31,7 @@ class VariablePlaceInferencePass : public DebugPass {
 private:
  // Mark the place of input arguments.
  void MarkInputPlace(SSAGraph* graph) {
+    CHECK(!graph->inputs().empty()) << "graph's inputs should be set";
    for (const auto& v : graph->inputs()) {
      // the feed op might in the inputs
      if (v->IsInstruct()) {
@@ -39,54 +40,60 @@ class VariablePlaceInferencePass : public DebugPass {
      }

      // auto& arg = v->AsArgument();
-      // arg.place.target = argument_default_target_;
+      // LOG(INFO) << "get graph input " << arg.name << " " << *arg.type;
+      // arg.type.target = argument_default_target_;
      // the other place description can't be determined yet, until their first
      // usage by some kernel.
    }
  }

+  void CheckAllArgumentTypeDetermined(SSAGraph* graph) {
+    for (auto& node : graph->mutable_nodes()) {
+      if (node.IsArgument()) {
+        CHECK(node.AsArgument().type) << "node " << node.AsArgument().name
+                                      << " type not determined";
+      }
+    }
+  }
+
  void InferenceArgumentPlace(SSAGraph* graph) {
    LOG(INFO) << "param-type-registry:\n" << ParamTypeRegistry::Global();
    for (auto& x : graph->InstructTopologicalOrder()) {
      auto& inst = x->AsInstruct();
-      CHECK(inst.place.is_valid())
-          << "kernel's place should be set when loaded";
+      // The IoCopyOp is a tool operator, it won't support the type inference.
+      if (inst.op_type == "io_copy") continue;
+      // LOG(INFO) << "- inferencing type " <<
      // deal with inputs
-      for (auto& arg_name : inst.op_info->input_argnames()) {
-        auto type =
-            ParamTypeRegistry::Global().Retrieve<ParamTypeRegistry::IO::kInput>(
-                inst.place, inst.op_type, arg_name);
-        CHECK(type) << "no param-type found for " << inst.op_type << ":"
-                    << arg_name << " " << inst.place.DebugString();
-        auto arg_names = inst.op_info->input_argument().at(arg_name);
+      for (auto& arg_name : inst.op_info()->input_argnames()) {
+        LOG(INFO) << "-- input arg_name " << arg_name;
        // check if inputs's place is set, if not set, update them with the
        // kernel's declaration.
+        auto type = inst.picked_kernel().GetInputDeclType(arg_name);
+        auto arg_names = inst.op_info()->input_argument().at(arg_name);

        for (auto& arg_name : arg_names) {
+          LOG(INFO) << "--- var " << arg_name;
          auto* node = graph->RetrieveArgument(arg_name);
          CHECK(node) << "argument " << arg_name << " not exists in the graph";
          auto& arg_node = node->AsArgument();
          if (arg_node.type) continue;
-          arg_node.type = type->type;
+          arg_node.type = type;
        }
      }

-      for (auto& arg_name : inst.op_info->output_argnames()) {
-        auto type = ParamTypeRegistry::Global()
-                        .Retrieve<ParamTypeRegistry::IO::kOutput>(
-                            inst.place, inst.op_type, arg_name);
-        CHECK(type) << "no param-type found for " << inst.op_type << ":"
-                    << arg_name << " " << inst.place.DebugString();
-        auto arg_names = inst.op_info->output_argument().at(arg_name);
+      for (auto& arg_name : inst.op_info()->output_argnames()) {
+        LOG(INFO) << "-- output arg_name " << arg_name;
+        auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
+        auto arg_names = inst.op_info()->output_argument().at(arg_name);
        // check if outputs's place is set, if not set, update them with the
        // kernel's declaration.
-
        for (auto& arg_name : arg_names) {
+          LOG(INFO) << "--- var " << arg_name;
          auto* node = graph->RetrieveArgument(arg_name);
          CHECK(node) << "argument " << arg_name << " not exists in the graph";
          auto& arg_node = node->AsArgument();
          if (arg_node.type) continue;
-          node->AsArgument().type = type->type;
+          node->AsArgument().type = type;
        }
      }
    }

--- a/paddle/fluid/lite/core/op_lite.cc
+++ b/paddle/fluid/lite/core/op_lite.cc
@@ -27,13 +27,15 @@ std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
  for (auto place : places) {
    auto ks = KernelRegistry::Global().Create(
        (kernel_type.empty() ? op_type_ : kernel_type), place.target,
-        place.precision);
+        place.precision, place.layout);
    for (auto &&it : ks) {
      AttachKernel(it.get());
      kernels.emplace_back(std::move(it));
    }
  }

+  CHECK(!kernels.empty()) << "No kernel found for Op " << op_type_;
+  LOG(INFO) << "op " << op_type_ << " get " << kernels.size() << " kernels";
  return kernels;
 }

@@ -59,9 +61,10 @@ bool OpLite::Run() {
 }

 bool OpLite::Attach(const framework::OpDesc &opdesc, lite::Scope *scope) {
-  CHECK(!op_info_) << "op_info duplicate build found";
-  op_info_ = std::make_shared<OpInfo>();
-  op_info_->Build(opdesc);
+  CHECK(scope);
+  scope_ = scope;
+  op_info_.reset(new OpInfo);  // Force clean the out-of-date infomation.
+  op_info_->Build(opdesc.ReadonlyProto());
  return AttachImpl(opdesc, scope);
 }

@@ -79,7 +82,8 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope,
  return var->GetMutable<lite::Tensor>();
 }

-bool OpInfo::GetInputArgname(const std::string &value_name, std::string *out) {
+bool OpInfo::GetInputArgname(const std::string &value_name,
+                             std::string *out) const {
  for (auto &item : input_argument_) {
    auto it = std::find(item.second.begin(), item.second.end(), value_name);
    if (it != item.second.end()) {
@@ -89,7 +93,8 @@ bool OpInfo::GetInputArgname(const std::string &value_name, std::string *out) {
  }
  return false;
 }
-bool OpInfo::GetOutputArgname(const std::string &value_name, std::string *out) {
+bool OpInfo::GetOutputArgname(const std::string &value_name,
+                              std::string *out) const {
  for (auto &item : output_argument_) {
    auto it = std::find(item.second.begin(), item.second.end(), value_name);
    if (it != item.second.end()) {

--- a/paddle/fluid/lite/core/op_lite.h
+++ b/paddle/fluid/lite/core/op_lite.h
@@ -81,19 +81,30 @@ class OpLite : public Registry {
  // Run this operator.
  virtual bool Run();

+  // Link the external execution environ to internal context.
  bool Attach(const framework::OpDesc &opdesc, lite::Scope *scope);

-  const std::shared_ptr<OpInfo> &op_info() const { return op_info_; }
-  std::shared_ptr<OpInfo> &mutable_op_info() { return op_info_; }
+  const OpInfo *op_info() const { return op_info_.get(); }
+  OpInfo *mutable_op_info() { return op_info_.get(); }

  // Human-readable information.
  virtual std::string DebugString() const = 0;

  const Place &kernel_place() const { return kernel_place_; }

+  // NOTE This might be discarded.
  void PickKernel(const std::vector<Place> &valid_places,
                  KernelStrategy kernel_strategy = KernelStrategy::kStatic);

+  // Create all the kernels for the valid targets.
+  std::vector<std::unique_ptr<KernelBase>> CreateKernels(
+      const std::vector<Place> &places, const std::string &kernel_type = "");
+
+  lite::Scope *scope() { return scope_; }
+
+  // Assign op param to kernel.
+  virtual void AttachKernel(KernelBase *kernel) = 0;
+
  virtual ~OpLite() = default;

 protected:
@@ -101,9 +112,6 @@ class OpLite : public Registry {
  virtual bool AttachImpl(const framework::OpDesc &opdesc,
                          lite::Scope *scope) = 0;

-  // Assign op param to kernel.
-  virtual void AttachKernel(KernelBase *kernel) = 0;
-
  // Specify the kernel to run by default. This will specify the value of
  // `kernel_place_`.
  virtual void StaticPickKernel(const std::vector<Place> &valid_targets) {
@@ -118,10 +126,6 @@ class OpLite : public Registry {
  // some inputs are ready.
  void RecordOutputEvents() {}

-  // Create all the kernels for the valid targets.
-  std::vector<std::unique_ptr<KernelBase>> CreateKernels(
-      const std::vector<Place> &places, const std::string &kernel_type = "");
-
  const Tensor *GetTensor(lite::Scope *scope, const std::string &name) const;
  Tensor *GetMutableTensor(lite::Scope *scope, const std::string &name) const;

@@ -129,11 +133,12 @@ class OpLite : public Registry {
  friend class mir::SSAGraph;

 protected:
+  lite::Scope *scope_{};
  std::unique_ptr<KernelBase> kernel_;
  std::string op_type_;
  std::vector<Place> valid_places_;
  Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
-  std::shared_ptr<OpInfo> op_info_;
+  std::unique_ptr<OpInfo> op_info_;
 };

 /*
@@ -142,22 +147,30 @@ class OpLite : public Registry {
 */
 class OpInfo {
 public:
-  void Build(const framework::OpDesc &desc) {
+  // To avoid the bugs from legancy framework::OpDesc, we use the ProtoBuf
+  // message instead.
+  void Build(const framework::proto::OpDesc &desc) {
    ExtractInputsAndOutputs(desc);
    CollectInputAndOutputArgnames(desc);
    CollectArguments(desc);
+    desc_.reset(new framework::proto::OpDesc(desc));
  }

+  const framework::proto::OpDesc &desc() const {
+    CHECK(desc_) << "desc has't set";
+    return *desc_;
+  }
+  framework::proto::OpDesc *mutable_desc() { return desc_.get(); }
  const std::list<std::string> &input_names() const { return input_names_; }
  const std::list<std::string> &output_names() const { return output_names_; }
-  const std::map<std::string, std::list<std::string>> &input_argument() {
+  const std::map<std::string, std::list<std::string>> &input_argument() const {
    return input_argument_;
  }
-  const std::map<std::string, std::list<std::string>> &output_argument() {
+  const std::map<std::string, std::list<std::string>> &output_argument() const {
    return output_argument_;
  }
-  bool GetInputArgname(const std::string &value_name, std::string *out);
-  bool GetOutputArgname(const std::string &value_name, std::string *out);
+  bool GetInputArgname(const std::string &value_name, std::string *out) const;
+  bool GetOutputArgname(const std::string &value_name, std::string *out) const;

  const std::list<std::string> &input_argnames() const {
    return input_argnames_;
@@ -167,37 +180,37 @@ class OpInfo {
  }

 private:
-  void ExtractInputsAndOutputs(const framework::OpDesc &opdesc) {
-    for (const auto &item : opdesc.Inputs()) {
-      for (const auto &x : item.second) {
+  void ExtractInputsAndOutputs(const framework::proto::OpDesc &opdesc) {
+    for (const auto &item : opdesc.inputs()) {
+      for (const auto &x : item.arguments()) {
        input_names_.push_back(x);
      }
    }
-    for (const auto &item : opdesc.Outputs()) {
-      for (const auto &x : item.second) {
+    for (const auto &item : opdesc.outputs()) {
+      for (const auto &x : item.arguments()) {
        output_names_.push_back(x);
      }
    }
  }

-  void CollectInputAndOutputArgnames(const framework::OpDesc &opdesc) {
-    for (const auto &item : opdesc.InputNames()) {
-      input_argnames_.push_back(item);
+  void CollectInputAndOutputArgnames(const framework::proto::OpDesc &opdesc) {
+    for (const auto &item : opdesc.inputs()) {
+      input_argnames_.push_back(item.parameter());
    }
-    for (const auto &item : opdesc.OutputNames()) {
-      output_argnames_.push_back(item);
+    for (const auto &item : opdesc.outputs()) {
+      output_argnames_.push_back(item.parameter());
    }
  }

-  void CollectArguments(const framework::OpDesc &opdesc) {
-    for (const auto &item : opdesc.Inputs()) {
-      for (auto &x : item.second) {
-        input_argument_[item.first].push_back(x);
+  void CollectArguments(const framework::proto::OpDesc &opdesc) {
+    for (const auto &item : opdesc.inputs()) {
+      for (auto &x : item.arguments()) {
+        input_argument_[item.parameter()].push_back(x);
      }
    }
-    for (const auto &item : opdesc.Outputs()) {
-      for (auto &x : item.second) {
-        output_argument_[item.first].push_back(x);
+    for (const auto &item : opdesc.outputs()) {
+      for (auto &x : item.arguments()) {
+        output_argument_[item.parameter()].push_back(x);
      }
    }
  }
@@ -209,6 +222,8 @@ class OpInfo {
  std::list<std::string> output_argnames_;
  std::map<std::string, std::list<std::string>> input_argument_;
  std::map<std::string, std::list<std::string>> output_argument_;
+  // NOTE too heavy.
+  std::unique_ptr<framework::proto::OpDesc> desc_;
 };

 }  // namespace lite

--- a/paddle/fluid/lite/core/op_registry.cc
+++ b/paddle/fluid/lite/core/op_registry.cc
@@ -18,13 +18,33 @@ namespace paddle {
 namespace lite {

 std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
-    const std::string &op_type, TargetType target, PrecisionType precision) {
-#define CREATE_KERNEL(target__)                                    \
-  switch (precision) {                                             \
-    case PRECISION(kFloat):                                        \
-      return Create<TARGET(target__), PRECISION(kFloat)>(op_type); \
-    default:                                                       \
-      CHECK(false) << "not supported kernel place yet";            \
+    const std::string &op_type, TargetType target, PrecisionType precision,
+    DataLayoutType layout) {
+  Place place{target, precision, layout};
+  LOG(INFO) << "creating " << op_type << " kernel for " << place;
+#define CREATE_KERNEL1(target__, precision__)                                \
+  switch (layout) {                                                          \
+    case DATALAYOUT(kNCHW):                                                  \
+      return Create<TARGET(target__), PRECISION(precision__),                \
+                    DATALAYOUT(kNCHW)>(op_type);                             \
+    case DATALAYOUT(kAny):                                                   \
+      return Create<TARGET(target__), PRECISION(precision__),                \
+                    DATALAYOUT(kAny)>(op_type);                              \
+    default:                                                                 \
+      LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \
+  }
+
+#define CREATE_KERNEL(target__)                         \
+  switch (precision) {                                  \
+    case PRECISION(kFloat):                             \
+      CREATE_KERNEL1(target__, kFloat);                 \
+    case PRECISION(kInt8):                              \
+      CREATE_KERNEL1(target__, kInt8);                  \
+    case PRECISION(kAny):                               \
+      CREATE_KERNEL1(target__, kAny);                   \
+    default:                                            \
+      CHECK(false) << "not supported kernel precision " \
+                   << PrecisionToStr(precision);        \
  }

  switch (target) {
@@ -38,7 +58,7 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
      CREATE_KERNEL(kCUDA);
    } break;
    default:
-      CHECK(false) << "not supported kernel place";
+      CHECK(false) << "not supported kernel target " << TargetToStr(target);
  }

 #undef CREATE_KERNEL
@@ -46,14 +66,21 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
 }

 KernelRegistry::KernelRegistry() {
-#define INIT_FOR(target__, precision__)                                      \
+#define INIT_FOR(target__, precision__, layout__)                            \
  registries_[KernelRegistry::GetKernelOffset<TARGET(target__),              \
-                                              PRECISION(precision__)>()]     \
-      .set<KernelRegistryForTarget<TARGET(target__), PRECISION(precision__)> \
-               *>(&KernelRegistryForTarget<TARGET(target__),                 \
-                                           PRECISION(precision__)>::Global());
+                                              PRECISION(precision__),        \
+                                              DATALAYOUT(layout__)>()]       \
+      .set<KernelRegistryForTarget<TARGET(target__), PRECISION(precision__), \
+                                   DATALAYOUT(layout__)> *>(                 \
+          &KernelRegistryForTarget<TARGET(target__), PRECISION(precision__), \
+                                   DATALAYOUT(layout__)>::Global());
  // Currently, just register 2 kernel targets.
-  INIT_FOR(kHost, kFloat);
+  INIT_FOR(kCUDA, kFloat, kNCHW);
+  INIT_FOR(kCUDA, kAny, kNCHW);
+  INIT_FOR(kHost, kFloat, kNCHW);
+  INIT_FOR(kHost, kAny, kNCHW);
+  INIT_FOR(kHost, kAny, kAny);
+  INIT_FOR(kCUDA, kAny, kAny);
 #undef INIT_FOR
 }


--- a/paddle/fluid/lite/core/op_registry.h
+++ b/paddle/fluid/lite/core/op_registry.h
@@ -50,80 +50,108 @@ class OpLiteRegistor : public Registor<OpClass> {
        }) {}
 };

-template <TargetType Target, PrecisionType Precision>
+template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
 using KernelRegistryForTarget =
-    Factory<OpKernel<Target, Precision>, std::unique_ptr<KernelBase>>;
+    Factory<OpKernel<Target, Precision, Layout>, std::unique_ptr<KernelBase>>;

 class KernelRegistry final {
 public:
  using any_kernel_registor_t =
-      variant<KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kFloat)> *,  //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kInt8)> *,   //
-              KernelRegistryForTarget<TARGET(kX86), PRECISION(kFloat)> *,   //
-              KernelRegistryForTarget<TARGET(kX86), PRECISION(kInt8)> *,    //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat)> *   //
+      variant<KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kInt8),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kX86), PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kX86), PRECISION(kInt8),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kHost), PRECISION(kAny),
+                                      DATALAYOUT(kAny)> *,  //
+              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kAny),
+                                      DATALAYOUT(kAny)> *  //
              >;

  KernelRegistry();

  static KernelRegistry &Global();

-  template <TargetType Target, PrecisionType Precision>
+  template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
  void Register(const std::string &name,
-                typename KernelRegistryForTarget<Target, Precision>::creator_t
-                    &&creator) {
-    using kernel_registor_t = KernelRegistryForTarget<Target, Precision>;
-    registries_[GetKernelOffset<Target, Precision>()]
-        .template get<kernel_registor_t *>()
-        ->Register(name, std::move(creator));
+                typename KernelRegistryForTarget<Target, Precision,
+                                                 Layout>::creator_t &&creator) {
+    LOG(INFO) << "register for " << TargetToStr(Target) << ":"
+              << PrecisionToStr(Precision) << "//"
+              << GetKernelOffset<Target, Precision, Layout>();
+    using kernel_registor_t =
+        KernelRegistryForTarget<Target, Precision, Layout>;
+    auto &varient = registries_[GetKernelOffset<Target, Precision, Layout>()];
+    varient.template get<kernel_registor_t *>()->Register(name,
+                                                          std::move(creator));
  }

-  template <TargetType Target, PrecisionType Precision>
+  template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
  std::list<std::unique_ptr<KernelBase>> Create(const std::string &op_type) {
-    using kernel_registor_t = KernelRegistryForTarget<Target, Precision>;
-    return registries_[GetKernelOffset<Target, Precision>()]
+    using kernel_registor_t =
+        KernelRegistryForTarget<Target, Precision, Layout>;
+    return registries_[GetKernelOffset<Target, Precision, Layout>()]
        .template get<kernel_registor_t *>()
        ->Creates(op_type);
  }

  std::list<std::unique_ptr<KernelBase>> Create(const std::string &op_type,
                                                TargetType target,
-                                                PrecisionType precision);
+                                                PrecisionType precision,
+                                                DataLayoutType layout);

  // Get a kernel registry offset in all the registries.
-  template <TargetType Target, PrecisionType Precision>
-  static constexpr int GetKernelOffset() {
-    return kNumTargets * static_cast<int>(Target) + static_cast<int>(Precision);
+  template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
+  static int GetKernelOffset() {
+    CHECK_LT(static_cast<int>(Target), static_cast<int>(TARGET(NUM)));
+    CHECK_LT(static_cast<int>(Precision), static_cast<int>(PRECISION(NUM)));
+    CHECK_LT(static_cast<int>(Layout), static_cast<int>(DATALAYOUT(NUM)));
+    return static_cast<int>(Target) * static_cast<int>(PRECISION(NUM)) *
+               static_cast<int>(DATALAYOUT(NUM)) +                            //
+           static_cast<int>(Precision) * static_cast<int>(DATALAYOUT(NUM)) +  //
+           static_cast<int>(Layout);
  }

  std::string DebugString() const {
    std::stringstream ss;

    ss << "KernelCreator<host, float>:" << std::endl;
-    ss << registries_[GetKernelOffset<TARGET(kHost), PRECISION(kFloat)>()]
-              .get<
-                  KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat)> *>()
+    ss << registries_[GetKernelOffset<TARGET(kHost), PRECISION(kFloat),
+                                      DATALAYOUT(kAny)>()]
+              .get<KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat),
+                                           DATALAYOUT(kNCHW)> *>()
              ->DebugString();
    ss << std::endl;
    return ss.str();
  }

 private:
-  mutable std::array<any_kernel_registor_t, kNumTargets * kNumPrecisions>
+  mutable std::array<any_kernel_registor_t,
+                     static_cast<int>(TARGET(NUM)) *
+                         static_cast<int>(PRECISION(NUM)) *
+                         static_cast<int>(DATALAYOUT(NUM))>
      registries_;
 };

-template <TargetType target, PrecisionType precision, typename KernelType>
+template <TargetType target, PrecisionType precision, DataLayoutType layout,
+          typename KernelType>
 class KernelRegistor : public lite::Registor<KernelType> {
 public:
-  KernelRegistor(const std::string op_type)
-      : Registor<KernelType>([&] {
+  KernelRegistor(const std::string &op_type, const std::string &alias)
+      : Registor<KernelType>([=] {
          LOG(INFO) << "Register kernel " << op_type << " for "
-                    << TargetToStr(target) << " " << PrecisionToStr(precision);
-          KernelRegistry::Global().Register<target, precision>(
-              op_type, [&, op_type]() -> std::unique_ptr<KernelType> {
+                    << TargetToStr(target) << " " << PrecisionToStr(precision)
+                    << " " << DataLayoutToStr(layout) << " alias " << alias;
+          KernelRegistry::Global().Register<target, precision, layout>(
+              op_type, [=]() -> std::unique_ptr<KernelType> {
                std::unique_ptr<KernelType> x(new KernelType);
                x->set_op_type(op_type);
+                x->set_alias(alias);
                return x;
              });
        }) {}
@@ -151,35 +179,40 @@ class KernelRegistor : public lite::Registor<KernelType> {
 #define LITE_KERNEL_REGISTER(op_type__, target__, precision__) \
  op_type__##__##target__##__##precision__##__registor__
 #define LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, \
-                                      alias__)                          \
+                                      layout__, alias__)                \
  op_type__##__##target__##__##precision__##__registor__instance__##alias__
 #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \
  LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__)

-#define REGISTER_LITE_KERNEL(op_type__, target__, precision__, KernelClass,  \
-                             alias__)                                        \
-  static paddle::lite::KernelRegistor<TARGET(target__),                      \
-                                      PRECISION(precision__), KernelClass>   \
-      LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__,        \
-                                    alias__)(#op_type__);                    \
-  static KernelClass LITE_KERNEL_INSTANCE(op_type__, target__, precision__,  \
-                                          alias__);                          \
-  int touch_##op_type__##target__##precision__##alias__() {                  \
-    LITE_KERNEL_INSTANCE(op_type__, target__, precision__, alias__).Touch(); \
-    return 0;                                                                \
-  }                                                                          \
-  static bool LITE_KERNEL_PARAM_INSTANCE(op_type__, target__, precision__,   \
-                                         alias__) __attribute__((unused)) =  \
-      paddle::lite::ParamTypeRegistry::NewInstance<TARGET(target__),         \
-                                                   PRECISION(precision__)>(  \
-          #op_type__)
-
-#define USE_LITE_KERNEL(op_type__, target__, precision__, alias__)        \
-  extern int touch_##op_type__##target__##precision__##alias__();         \
-  int op_type__##target__##precision__##alias__ __attribute__((unused)) = \
-      touch_##op_type__##target__##precision__##alias__();
-
-#define LITE_KERNEL_INSTANCE(op_type__, target__, precision__, alias__) \
-  op_type__##target__##precision__##alias__
-#define LITE_KERNEL_PARAM_INSTANCE(op_type__, target__, precision__, alias__) \
-  op_type__##target__##precision__##alias__##param_register
+#define REGISTER_LITE_KERNEL(op_type__, target__, precision__, layout__,      \
+                             KernelClass, alias__)                            \
+  static paddle::lite::KernelRegistor<TARGET(target__),                       \
+                                      PRECISION(precision__),                 \
+                                      DATALAYOUT(layout__), KernelClass>      \
+      LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__,         \
+                                    layout__, alias__)(#op_type__, #alias__); \
+  static KernelClass LITE_KERNEL_INSTANCE(op_type__, target__, precision__,   \
+                                          layout__, alias__);                 \
+  int touch_##op_type__##target__##precision__##layout__##alias__() {         \
+    LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \
+        .Touch();                                                             \
+    return 0;                                                                 \
+  }                                                                           \
+  static bool LITE_KERNEL_PARAM_INSTANCE(op_type__, target__, precision__,    \
+                                         layout__, alias__)                   \
+      __attribute__((unused)) = paddle::lite::ParamTypeRegistry::NewInstance< \
+          TARGET(target__), PRECISION(precision__), DATALAYOUT(layout__)>(    \
+          #op_type__ "/" #alias__)
+
+#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \
+  extern int touch_##op_type__##target__##precision__##layout__##alias__();  \
+  int op_type__##target__##precision__##layout__##alias__                    \
+      __attribute__((unused)) =                                              \
+          touch_##op_type__##target__##precision__##layout__##alias__();
+
+#define LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, \
+                             alias__)                                    \
+  op_type__##target__##precision__##layout__##alias__
+#define LITE_KERNEL_PARAM_INSTANCE(op_type__, target__, precision__, layout__, \
+                                   alias__)                                    \
+  op_type__##target__##precision__##layout__##alias__##param_register
--- a/paddle/fluid/lite/core/optimizer.cc
+++ b/paddle/fluid/lite/core/optimizer.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/optimizer.h"
+#include "paddle/fluid/lite/core/mir/io_complement_pass.h"
 #include "paddle/fluid/lite/core/mir/static_kernel_pick_pass.h"

 namespace paddle {
@@ -25,5 +26,33 @@ void Optimizer::SpecifyKernelPickTactic(core::KernelPickFactor factor) {

  *pass->mutable_kernel_pick_factors() = factor;
 }
+
+void Optimizer::RunPasses() {
+  std::vector<std::string> passes({
+      "static_kernel_pick_pass",        //
+      "variable_place_inference_pass",  //
+      "argument_type_display_pass",     //
+      "io_complement_pass",             //
+      "argument_type_display_pass",     //
+      "variable_place_inference_pass",  //
+      "argument_type_display_pass",     //
+      "io_copy_kernel_pick_pass",       //
+      "variable_place_inference_pass",  //
+  });
+  for (auto& pass_type : passes) {
+    LOG(INFO) << ".. running pass " << pass_type;
+    auto* pass = mir::PassManager::Global().LookUp(pass_type);
+    CHECK(pass);
+    if (pass->name() == "io_complement_pass") {
+      auto* _pass = dynamic_cast<mir::IoComplementPass*>(pass);
+      _pass->SetValidPlaces(valid_places_);
+      CHECK(!_pass->valid_places().empty());
+      _pass->Apply(graph_);
+    } else {
+      pass->Apply(graph_);
+    }
+  }
+  // mir::PassManager::Global().Run(graph_);
+}
 }  // namespace lite
 }  // namespace paddle
--- a/paddle/fluid/lite/core/optimizer.h
+++ b/paddle/fluid/lite/core/optimizer.h
@@ -16,8 +16,10 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/lite/core/mir/generate_program_pass.h"
+#include "paddle/fluid/lite/core/mir/io_complement_pass.h"
 #include "paddle/fluid/lite/core/mir/pass_manager.h"
 #include "paddle/fluid/lite/core/mir/ssa_graph.h"
+#include "paddle/fluid/lite/core/mir/static_kernel_pick_pass.h"
 #include "paddle/fluid/lite/core/program.h"
 #include "paddle/fluid/lite/core/types.h"

@@ -33,25 +35,46 @@ class Optimizer {
  void Run(Program&& program, const std::vector<Place>& valid_places,
           core::KernelPickFactor kernel_pick_factor,
           const std::vector<std::string>& passes = {}) {
+    valid_places_ = valid_places;
+    CHECK(!valid_places.empty()) << "At least one valid_place should be set";
    CHECK(!graph_) << "duplicate optimize found";
    graph_.reset(new mir::SSAGraph);
    graph_->Build(program, valid_places);
    SpecifyKernelPickTactic(kernel_pick_factor);
+    // InitIoComplement();
    RunPasses();
    exec_scope_ = program.exec_scope;
  }

+  void KernelPickPreferPlace(const Place& place) {
+    auto* pass = mir::PassManager::Global().LookUp<mir::StaticKernelPickPass>(
+        "static_kernel_pick_pass");
+    CHECK(pass);
+    pass->SetPreferPlace(place);
+  }
+
  // Generate a new program based on the mir graph.
  std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
+    LOG(INFO) << "generate program";
    std::unique_ptr<Program> res;
    auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
        "generate_program_pass");
+    pass->Apply(graph_);
    auto program = pass->GenProgram();
    CHECK(exec_scope_);
    program->set_exec_scope(exec_scope_);
    return program;
  }

+  void InitIoComplement() {
+    auto* pass = mir::PassManager::Global().LookUp<mir::IoComplementPass>(
+        "io_complement_pass");
+    CHECK(pass);
+    CHECK(!valid_places_.empty());
+    LOG(INFO) << "valid_places.size " << valid_places_.size();
+    pass->SetValidPlaces(valid_places_);
+  }
+
  // Generate C++ code which combines the inference program, model and weights.
  void GenCode(const std::string& code_dir);

@@ -64,13 +87,14 @@ class Optimizer {
  void SpecifyKernelPickTactic(core::KernelPickFactor factor);

  // Run the default passes registered in the PassManager.
-  void RunPasses() { mir::PassManager::Global().Run(graph_); }
+  void RunPasses();

  // Specify the passes and run them.
  void RunPasses(std::vector<std::string>& passes);

 private:
  std::unique_ptr<mir::SSAGraph> graph_;
+  std::vector<Place> valid_places_;
  lite::Scope* exec_scope_{};
 };


--- a/paddle/fluid/lite/core/program.h
+++ b/paddle/fluid/lite/core/program.h
@@ -84,13 +84,10 @@ struct Program {
    tmp_vars.push_back("fetch");
    for (auto var_desc : program.Block(0).AllVars()) {
      if (!var_desc->Persistable()) {
-        LOG(INFO) << "get tmp var " << var_desc->Name();
        tmp_vars.push_back(var_desc->Name());
-        auto* var = exec_scope->Var(var_desc->Name());
-        LOG(INFO) << "create tmp var " << var_desc->Name() << " " << var;
+        exec_scope->Var(var_desc->Name());
      } else {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
-        LOG(INFO) << "get weight var " << var_desc->Name();
        weights.push_back(var_desc->Name());
      }
    }
@@ -105,15 +102,19 @@ struct Instruction {
  void Run() {
    CHECK(op_);
    CHECK(kernel_);
-    LOG(INFO) << "running kernel> " << kernel_->DebugString();
    if (UNLIKELY(first_epoch_)) {
      first_epoch_ = false;
-      op_->CheckShape();
+      CHECK(op_->CheckShape());
    }
    op_->InferShape();
    kernel_->Run();
  }

+  friend std::ostream& operator<<(std::ostream& os, const Instruction& other) {
+    os << other.kernel_->summary() << "\t(" << other.kernel_->doc() << ")";
+    return os;
+  }
+
 private:
  std::shared_ptr<OpLite> op_;
  std::unique_ptr<KernelBase> kernel_;
@@ -125,11 +126,16 @@ struct Instruction {
 */
 class RuntimeProgram {
 public:
-  explicit RuntimeProgram(std::vector<Instruction>&& instruction)
-      : instructions_(std::move(instruction)) {}
+  explicit RuntimeProgram(std::vector<Instruction>&& insts)
+      : instructions_(std::move(insts)) {
+    if (insts.empty()) {
+      LOG(ERROR) << "no instructions";
+    }
+  }

  void Run() {
    for (auto& inst : instructions_) {
+      LOG(INFO) << ">> Running kernel: " << inst;
      inst.Run();
    }
  }

--- a/paddle/fluid/lite/core/target_wrapper.h
+++ b/paddle/fluid/lite/core/target_wrapper.h
@@ -16,6 +16,10 @@
 #include <glog/logging.h>
 #include <iostream>
 #include <sstream>
+#ifdef LITE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif

 namespace paddle {
 namespace lite {
@@ -26,20 +30,20 @@ enum class TargetType : int {
  kX86,
  kCUDA,
  kAny,  // any target
-  kLastAsPlaceHolder,
+  NUM,   // number of fields.
 };
 enum class PrecisionType : int {
  kUnk = 0,
  kFloat,
  kInt8,
  kAny,  // any precision
-  kLastAsPlaceHolder,
+  NUM,   // number of fields.
 };
 enum class DataLayoutType : int {
  kUnk = 0,
  kNCHW,
  kAny,  // any data layout
-  kLastAsPlaceHolder,
+  NUM,   // number of fields.
 };

 // Some helper macro to get a specific TargetType.
@@ -50,25 +54,29 @@ enum class DataLayoutType : int {
 #define PRECISION_VAL(item__) static_cast<int>(PRECISION(item__))
 #define DATALAYOUT(item__) paddle::lite::DataLayoutType::item__

-constexpr const int kNumPrecisions =
-    PRECISION_VAL(kLastAsPlaceHolder) - PRECISION_VAL(kFloat);
-constexpr const int kNumTargets =
-    TARGET_VAL(kLastAsPlaceHolder) - TARGET_VAL(kHost);
+constexpr const int kNumPrecisions = PRECISION_VAL(NUM);
+constexpr const int kNumTargets = TARGET_VAL(NUM);

 static const std::string target2string[] = {"unk", "host", "x86", "cuda",
                                            "any"};
 static const std::string& TargetToStr(TargetType target) {
-  return target2string[static_cast<int>(target)];
+  auto x = static_cast<int>(target);
+  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
+  return target2string[x];
 }

 static const std::string precision2string[] = {"unk", "float", "int8", "any"};
 static const std::string& PrecisionToStr(PrecisionType precision) {
-  return precision2string[static_cast<int>(precision)];
+  auto x = static_cast<int>(precision);
+  CHECK_LT(x, static_cast<int>(PRECISION(NUM)));
+  return precision2string[x];
 }

 static const std::string datalayout2string[] = {"unk", "NCHW", "any"};
-static const std::string& DataLayoutToStr(DataLayoutType x) {
-  return datalayout2string[static_cast<int>(x)];
+static const std::string& DataLayoutToStr(DataLayoutType layout) {
+  auto x = static_cast<int>(layout);
+  CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
+  return datalayout2string[x];
 }

 /*
@@ -187,5 +195,37 @@ class TargetWrapper<TARGET(kHost)> {
  }
 };

+#ifdef LITE_WITH_CUDA
+// This interface should be specified by each kind of target.
+template <>
+class TargetWrapper<TARGET(kCUDA), cudaStream_t, cudaEvent_t> {
+ public:
+  using stream_t = cudaStream_t;
+  using event_t = cudaEvent_t;
+
+  static size_t num_devices() { return 0; }
+  static size_t maximum_stream() { return 0; }
+
+  static void CreateStream(stream_t* stream) {}
+  static void DestroyStream(const stream_t& stream) {}
+
+  static void CreateEvent(event_t* event) {}
+  static void DestroyEvent(const event_t& event) {}
+
+  static void RecordEvent(const event_t& event) {}
+  static void SyncEvent(const event_t& event) {}
+
+  static void StreamSync(const stream_t& stream) {}
+
+  static void* Malloc(size_t size);
+  static void Free(void* ptr);
+
+  static void MemcpySync(void* dst, const void* src, size_t size,
+                         IoDirection dir);
+  static void MemcpyAsync(void* dst, const void* src, size_t size,
+                          IoDirection dir, const stream_t& stream);
+};
+#endif  // LITE_WITH_CUDA
+
 }  // namespace lite
 }  // namespace paddle
--- a/paddle/fluid/lite/core/type_system.cc
+++ b/paddle/fluid/lite/core/type_system.cc
@@ -87,20 +87,41 @@ const Type* Type::Get<TensorAnyTy>(TargetType target) {
  }
 }

+template <TargetType Target>
+const Type* GetTensorFp32NCHWTy() {
+  static TensorFp32NCHWTy x(Target);
+  return &x;
+}
+
 template <>
 const Type* Type::Get<TensorFp32NCHWTy>(TargetType target) {
  switch (target) {
-    case TargetType::kX86:
-      return Get<false, true, TargetType::kX86, PrecisionType::kFloat,
-                 DataLayoutType::kNCHW>();
-    case TargetType::kHost:
-      return Get<false, true, TargetType::kHost, PrecisionType::kFloat,
-                 DataLayoutType::kNCHW>();
+    case TARGET(kHost):
+      return GetTensorFp32NCHWTy<TARGET(kHost)>();
+    case TARGET(kCUDA):
+      return GetTensorFp32NCHWTy<TARGET(kCUDA)>();
+    case TARGET(kX86):
+      return GetTensorFp32NCHWTy<TARGET(kX86)>();
+    default:
+      LOG(FATAL) << "unsupported target Type " << TargetToStr(target);
+  }
+  return nullptr;
+}

+const Type* LookupType(DataTypeBase::ID type_id, bool is_unknown,
+                       bool is_tensor, Place place) {
+  using id_t = DataTypeBase::ID;
+  switch (type_id) {
+    case id_t::Tensor_Any:
+      return Type::Get<TensorAnyTy>(place.target);
+    case id_t::Tensor_Fp32_NCHW:
+      return Type::Get<TensorFp32NCHWTy>(place.target);
+    case id_t::TensorList_Any:
+      return Type::Get<TensorListAnyTy>(place.target);
    default:
-      LOG(FATAL) << "unsupported target " << TargetToStr(target);
-      return nullptr;
+      LOG(FATAL) << "unsupported type";
  }
+  return nullptr;
 }

 // ------------------------- end GetType specification ------------------------

--- a/paddle/fluid/lite/core/type_system.h
+++ b/paddle/fluid/lite/core/type_system.h
@@ -131,6 +131,23 @@ class Type : public DataTypeBase {
  bool operator==(const Type& other) {
    return id_ == other.id() && place_ == other.place();
  }
+  friend std::ostream& operator<<(std::ostream& os, const Type& other) {
+    if (other.IsUnsupported()) {
+      os << "<Unsupported>";
+      return os;
+    }
+    if (other.IsVoid()) {
+      os << "<Void>";
+      return os;
+    }
+    if (other.is_tensor_) {
+      os << "<Tensor:";
+    }
+    os << TargetToStr(other.target()) << "/"
+       << PrecisionToStr(other.precision()) << "/"
+       << DataLayoutToStr(other.layout()) << ">";
+    return os;
+  }

  // Can cast to another type. This is heavily used in MIR, by determine whether
  // is is possible to add a instruction to transform a type to another.
@@ -163,29 +180,33 @@ class Type : public DataTypeBase {
 };

 // -------------------------------- compatible check ---------------------------
-static bool TargetCompatible(const Type& a, const Type& b) {
-  return (a.IsVoid() || b.IsVoid()) ||  //
-         a.target() == b.target();
+static bool TargetCompatibleTo(const Type& a, const Type& b) {
+  return a.IsVoid() ||                                                  //
+         (a.IsTensor() && b.IsTensor() && (a.target() == b.target() ||  //
+                                           b.target() == TARGET(kAny)));
 }

-static bool DataLayoutCompatible(const Type& a, const Type& b) {
-  return (a.IsVoid() || b.IsVoid()) ||  //
-         (a.IsTensor() && b.IsTensor() && a.layout() == b.layout());
+static bool DataLayoutCompatibleTo(const Type& a, const Type& b) {
+  return a.IsVoid() ||                                                  //
+         (a.IsTensor() && b.IsTensor() && (a.layout() == b.layout() ||  //
+                                           b.layout() == DATALAYOUT(kAny)));
 }

-static bool PrecisionCompatible(const Type& a, const Type& b) {
-  return (a.IsVoid() || b.IsVoid()) ||  //
-         (a.precision() == b.precision());
+static bool PrecisionCompatibleTo(const Type& a, const Type& b) {
+  return a.IsVoid() ||                                                        //
+         (a.IsTensor() && b.IsTensor() && (a.precision() == b.precision() ||  //
+                                           b.precision() == PRECISION(kAny)));
 }

-static bool DeviceCompatible(const Type& a, const Type& b) {
-  return (a.IsVoid() || b.IsVoid()) ||  //
-         (a.device() == b.device());
+static bool DeviceCompatibleTo(const Type& a, const Type& b) {
+  return a.IsVoid() ||  //
+         (a.IsTensor() && b.IsTensor() && (a.device() == b.device()));
 }

-static bool TypeCompatible(const Type& a, const Type& b) {
-  return TargetCompatible(a, b) && DataLayoutCompatible(a, b) &&
-         PrecisionCompatible(a, b) && DeviceCompatible(a, b);
+// Can type 'a' be passed to 'b' directly.
+static bool TypeCompatibleTo(const Type& a, const Type& b) {
+  return TargetCompatibleTo(a, b) && DataLayoutCompatibleTo(a, b) &&
+         PrecisionCompatibleTo(a, b) && DeviceCompatibleTo(a, b);
 }

 // -------------------------------- predefined types ---------------------------
@@ -230,6 +251,9 @@ class TensorInt64NCHWTy : public Type {
      : Type(ID::Tensor_Int64_NCHW, "TensorInt64NCHW", true /*is_tensor*/,
             target, PrecisionType::kInt8, DataLayoutType::kNCHW) {}
 };
+
+const Type* LookupType(DataTypeBase::ID type_id, bool is_unknown,
+                       bool is_tensor, Place place);
 // ------------------------- end predefined types ---------------------------

 // NOTE TypeSystem has some overhead, and better to be used in analysis phase.
@@ -381,13 +405,15 @@ class ParamTypeRegistry {
    CHECK(types_.count(key));
  }

-  template <IO io>
-  const ParamType* Retrieve(const Place& place, const std::string& op_type,
-                            const std::string& arg_name) {
-    KernelIdTy key{op_type, place, io, arg_name};
-    auto it = types_.find(key);
-    if (it == types_.end()) return nullptr;
-    return &it->second;
+  const ParamType* RetrieveInArgument(const Place& place,
+                                      const std::string& op_type,
+                                      const std::string& arg_name) {
+    return Retrieve<IO::kInput>(place, op_type, arg_name);
+  }
+  const ParamType* RetrieveOutArgument(const Place& place,
+                                       const std::string& op_type,
+                                       const std::string& arg_name) {
+    return Retrieve<IO::kOutput>(place, op_type, arg_name);
  }

  static ParamTypeRegistry& Global() {
@@ -403,6 +429,16 @@ class ParamTypeRegistry {
    return os;
  }

+ protected:
+  template <IO io>
+  const ParamType* Retrieve(const Place& place, const std::string& op_type,
+                            const std::string& arg_name) {
+    KernelIdTy key{op_type, place, io, arg_name};
+    auto it = types_.find(key);
+    if (it == types_.end()) return nullptr;
+    return &it->second;
+  }
+
 private:
  ParamTypeRegistry() = default;


--- a/paddle/fluid/lite/core/types.cc
+++ b/paddle/fluid/lite/core/types.cc
@@ -43,6 +43,9 @@ bool KernelPickFactor::IsTargetConsidered() const {
 bool KernelPickFactor::IsDataLayoutConsidered() const {
  return data_ & static_cast<int>(Factor::DataLayoutFirst);
 }
+bool KernelPickFactor::IsDeviceConsidered() const {
+  return data_ & static_cast<int>(Factor::DeviceFirst);
+}

 }  // namespace core
 }  // namespace lite

--- a/paddle/fluid/lite/core/types.h
+++ b/paddle/fluid/lite/core/types.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <stack>
 #include "paddle/fluid/lite/core/context.h"
 #include "paddle/fluid/lite/utils/all.h"

@@ -38,6 +39,7 @@ class KernelPickFactor {
  bool AnyFactorConsidered() const { return data_; }

  KernelPickFactor& ConsiderTarget();
+  // Perfer a specific target, e.g. prefer CUDA kernels.
  KernelPickFactor& ConsiderPrecision();
  KernelPickFactor& ConsiderDataLayout();
  KernelPickFactor& ConsiderDevice();
@@ -45,12 +47,29 @@ class KernelPickFactor {
  bool IsTargetConsidered() const;
  bool IsPrecisionConsidered() const;
  bool IsDataLayoutConsidered() const;
-  bool IsDeviceConsidered() const {
-    return data_ & static_cast<int>(Factor::DeviceFirst);
+  bool IsDeviceConsidered() const;
+
+  friend std::ostream& operator<<(std::ostream& os, const KernelPickFactor& k) {
+    std::stack<bool> bits;
+    auto data = k.data_;
+    while (data) {
+      bits.push(data % 2);
+      data /= 2;
+    }
+    int nbits = bits.size();
+    for (size_t i = 0; i < sizeof(data) * 8 - nbits; i++) {
+      os << 0;
+    }
+    while (!bits.empty()) {
+      os << bits.top();
+      bits.pop();
+    }
+    return os;
  }

 private:
  unsigned char data_{};
+  TargetType target_{TARGET(kUnk)};
 };

 struct dim2 {

--- a/paddle/fluid/lite/cuda/target_wrapper.cc
+++ b/paddle/fluid/lite/cuda/target_wrapper.cc
@@ -12,10 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-//
-// Created by chunwei on 19-2-23.
-//
-
 #include "paddle/fluid/lite/cuda/target_wrapper.h"
 #include <glog/logging.h>

@@ -24,19 +20,14 @@ namespace lite {

 using TargetW = TargetWrapper<TARGET(kCUDA), cudaStream_t, cudaEvent_t>;

-template <>
 void* TargetW::Malloc(size_t size) {
  void* ptr{};
  CHECK_EQ(cudaSuccess, cudaMalloc(&ptr, size));
  return ptr;
 }

-template <>
-void TargetW::Free(void* ptr) {
-  CHECK_EQ(cudaSuccess, cudaFree(ptr));
-}
+void TargetW::Free(void* ptr) { CHECK_EQ(cudaSuccess, cudaFree(ptr)); }

-template <>
 void TargetW::MemcpySync(void* dst, const void* src, size_t size,
                         IoDirection dir) {
  switch (dir) {
@@ -55,7 +46,6 @@ void TargetW::MemcpySync(void* dst, const void* src, size_t size,
  }
 }

-template <>
 void TargetW::MemcpyAsync(void* dst, const void* src, size_t size,
                          IoDirection dir, const stream_t& stream) {
  switch (dir) {

--- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt
-cc_library(mul_compute_cuda SRCS mul_compute.cc DEPS tensor_lite)
+nv_library(mul_compute_cuda SRCS mul_compute.cc DEPS tensor_lite)
 cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS tensor_lite)
+
+nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda)
--- a/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc
+++ b/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc
@@ -21,7 +21,7 @@ namespace lite {
 namespace kernels {
 namespace cuda {

-using TargetW = TargetWrapper<TARGET(kHost), cudaStream_t, cudaEvent_t>;
+using TargetW = TargetWrapper<TARGET(kCUDA), cudaStream_t, cudaEvent_t>;

 // Host to CUDA memory.
 void CopyFromHostSync(void* target, const void* source, size_t size) {
@@ -51,6 +51,25 @@ class IoCopyHostToCudaCompute
    auto* data = param.y->mutable_data(target(), param.x->memory_size());
    CopyFromHostSync(data, param.x->data<void>(), param.x->memory_size());
  }
+
+  std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
+    std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
+    *res = [](const std::map<std::string, const Type*>& inputs,
+              const std::string& out) -> const Type* {
+      CHECK(!inputs.empty());
+      auto* type = inputs.at("Input");
+      CHECK(type->target() == TARGET(kHost));
+
+      auto out_place = type->place();
+      out_place.target = TARGET(kCUDA);
+      auto* out_type = LookupType(type->id(), type->IsUnsupported(),
+                                  type->IsUnsupported(), out_place);
+      return out_type;
+    };
+    return res;
+  }
+
+  std::string doc() const override { return "Copy IO from HOST to CUDA"; }
 };

 /*
@@ -65,6 +84,8 @@ class IoCopyCudaToHostCompute
    auto* data = param.y->mutable_data(TARGET(kHost), param.x->memory_size());
    CopyToHostSync(data, param.x->data<void>(), param.x->memory_size());
  }
+
+  std::string doc() const override { return "Copy IO from CUDA to HOST"; }
 };

 }  // namespace cuda
@@ -72,7 +93,7 @@ class IoCopyCudaToHostCompute
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(io_copy, kCUDA, kAny,
+REGISTER_LITE_KERNEL(io_copy, kCUDA, kAny, kAny,
                     paddle::lite::kernels::cuda::IoCopyHostToCudaCompute,
                     host_to_device)
    .BindInput("Input", {paddle::lite::Type::Get<paddle::lite::TensorAnyTy>(
@@ -81,7 +102,7 @@ REGISTER_LITE_KERNEL(io_copy, kCUDA, kAny,
                           TARGET(kCUDA))})
    .Finalize();

-REGISTER_LITE_KERNEL(io_copy, kCUDA, kAny,
+REGISTER_LITE_KERNEL(io_copy, kCUDA, kAny, kAny,
                     paddle::lite::kernels::cuda::IoCopyCudaToHostCompute,
                     device_to_host)
    .BindInput("Input", {paddle::lite::Type::Get<paddle::lite::TensorAnyTy>(

--- a/paddle/fluid/lite/kernels/cuda/mul_compute.cc
+++ b/paddle/fluid/lite/kernels/cuda/mul_compute.cc
@@ -13,3 +13,22 @@
 // limitations under the License.

 #include "paddle/fluid/lite/kernels/cuda/mul_compute.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace cuda {}  // namespace cuda
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW,
+                     paddle::lite::kernels::cuda::MulCompute, def)
+    .BindInput("X", {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(
+                        TARGET(kCUDA))})
+    .BindInput("Y", {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(
+                        TARGET(kCUDA))})
+    .BindOutput("Out", {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(
+                           TARGET(kCUDA))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/cuda/mul_compute.h
+++ b/paddle/fluid/lite/kernels/cuda/mul_compute.h
@@ -16,6 +16,7 @@
 #include "paddle/fluid/lite/core/kernel.h"
 #include "paddle/fluid/lite/core/types.h"
 #include "paddle/fluid/lite/cuda/blas.h"
+#include "paddle/fluid/lite/operators/op_params.h"

 namespace paddle {
 namespace lite {
@@ -29,11 +30,29 @@ void mul_compute(const lite::cuda::Blas<float>& blas, const T* x, int x_h,
             nullptr, out, 0);
 }

-class MulCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
+class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
 public:
  using param_t = operators::MulParam;

-  void Run() override {}
+  void Run() override {
+    CHECK(context_) << "running context should be set first";
+    auto& context = context_->AsCudaContext();
+    CHECK(context.blas_fp32) << "blas should init first";
+    auto& blas = *context.blas_fp32;
+    const auto& param = Param<operators::MulParam>();
+    CHECK(param.x->target() == TARGET(kCUDA));
+    auto* x = param.x->data<float>();
+    int x_h = param.x->dims()[0];
+    int x_w = param.x->dims()[1];
+
+    auto* y = param.y->data<float>();
+    int y_h = param.y->dims()[0];
+    int y_w = param.y->dims()[1];
+
+    auto* out = param.output->mutable_data<float>(TARGET(kCUDA));
+
+    mul_compute<float>(blas, x, x_h, x_w, y, y_h, y_w, out);
+  }

  virtual ~MulCompute() = default;
 };

--- a/paddle/fluid/lite/kernels/host/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/host/fc_compute.cc
@@ -51,8 +51,8 @@ void FcCompute::Run() {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(fc, kHost, kFloat, paddle::lite::kernels::host::FcCompute,
-                     def)
+REGISTER_LITE_KERNEL(fc, kHost, kFloat, kNCHW,
+                     paddle::lite::kernels::host::FcCompute, def)
    .BindInput("Input",
               {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(
                   TARGET(kHost))})

--- a/paddle/fluid/lite/kernels/host/feed_compute.cc
+++ b/paddle/fluid/lite/kernels/host/feed_compute.cc
@@ -20,7 +20,8 @@ namespace lite {
 namespace kernels {
 namespace host {

-class FeedCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
+class FeedCompute
+    : public OpKernel<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
 public:
  using param_t = operators::FeedParam;

@@ -38,7 +39,7 @@ class FeedCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(feed, kHost, kFloat,
+REGISTER_LITE_KERNEL(feed, kHost, kAny, kAny,
                     paddle::lite::kernels::host::FeedCompute, def)
    .BindInput("X", {paddle::lite::Type::Get<paddle::lite::TensorAnyTy>(
                        TARGET(kHost))})

--- a/paddle/fluid/lite/kernels/host/fetch_compute.cc
+++ b/paddle/fluid/lite/kernels/host/fetch_compute.cc
@@ -20,7 +20,8 @@ namespace lite {
 namespace kernels {
 namespace host {

-class FetchCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
+class FetchCompute
+    : public OpKernel<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
 public:
  using param_t = operators::FeedParam;

@@ -41,7 +42,7 @@ class FetchCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(fetch, kHost, kFloat,
+REGISTER_LITE_KERNEL(fetch, kHost, kAny, kAny,
                     paddle::lite::kernels::host::FetchCompute, def)
    .BindInput("X", {paddle::lite::Type::Get<paddle::lite::TensorAnyTy>(
                        TARGET(kHost))})

--- a/paddle/fluid/lite/kernels/host/mul_compute.cc
+++ b/paddle/fluid/lite/kernels/host/mul_compute.cc
@@ -67,7 +67,7 @@ class MulCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(mul, kHost, kFloat,
+REGISTER_LITE_KERNEL(mul, kHost, kFloat, kNCHW,
                     paddle::lite::kernels::host::MulCompute, def)
    .BindInput("X", {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(
                        TARGET(kHost))})

--- a/paddle/fluid/lite/kernels/host/relu_compute.h
+++ b/paddle/fluid/lite/kernels/host/relu_compute.h
@@ -42,6 +42,6 @@ class ReluCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(relu, kHost, kFloat,
+REGISTER_LITE_KERNEL(relu, kHost, kFloat, kNCHW,
                     paddle::lite::kernels::host::ReluCompute, def)
    .Finalize();
--- a/paddle/fluid/lite/kernels/host/scale_compute.cc
+++ b/paddle/fluid/lite/kernels/host/scale_compute.cc
@@ -50,7 +50,7 @@ class ScaleCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(scale, kHost, kFloat,
+REGISTER_LITE_KERNEL(scale, kHost, kFloat, kNCHW,
                     paddle::lite::kernels::host::ScaleCompute, def)
    .BindInput("X", {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(
                        TARGET(kHost))})

--- a/paddle/fluid/lite/operators/io_copy_op.cc
+++ b/paddle/fluid/lite/operators/io_copy_op.cc
@@ -24,7 +24,10 @@ bool IoCopyOp::CheckShape() const {
  CHECK_OR_FALSE(param_.y);
  return true;
 }
-bool IoCopyOp::InferShape() const { return true; }
+bool IoCopyOp::InferShape() const {
+  param_.y->Resize(param_.x->dims());
+  return true;
+}
 bool IoCopyOp::Run() { return OpLite::Run(); }
 bool IoCopyOp::AttachImpl(const paddle::framework::OpDesc &opdesc,
                          paddle::lite::Scope *scope) {

--- a/paddle/fluid/lite/operators/mul_op.h
+++ b/paddle/fluid/lite/operators/mul_op.h
@@ -51,9 +51,6 @@ class MulOpLite : public OpLite {
    param_.x_num_col_dims = boost::get<int>(op_desc.GetAttr("x_num_col_dims"));
    param_.y_num_col_dims = boost::get<int>(op_desc.GetAttr("y_num_col_dims"));

-    CHECK(kernel_);
-    kernel_->SetParam(param_);
-
    return true;
  }


--- a/paddle/fluid/lite/utils/factory.h
+++ b/paddle/fluid/lite/utils/factory.h
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include <glog/logging.h>
 #include <iostream>
 #include <list>
 #include <memory>
@@ -48,8 +49,6 @@ class Factory {
  }

  void Register(const std::string& op_type, creator_t&& creator) {
-    CHECK(!creators_.count(op_type)) << "The op " << op_type
-                                     << " has already registered";
    creators_[op_type].emplace_back(std::move(creator));
  }

@@ -58,9 +57,9 @@ class Factory {
  }

  std::list<item_ptr_t> Creates(const std::string& op_type) const {
-    auto it = creators_.find(op_type);
-    CHECK(it != creators_.end()) << "no item called " << op_type;
    std::list<item_ptr_t> res;
+    auto it = creators_.find(op_type);
+    if (it == creators_.end()) return res;
    for (auto& c : it->second) {
      res.emplace_back(c());
    }

--- a/paddle/fluid/lite/utils/varient.h
+++ b/paddle/fluid/lite/utils/varient.h
@@ -99,7 +99,7 @@ struct variant {

  size_t type() { return type_id; }

-  void valid() { return (type_id != invalid_type()); }
+  bool valid() { return (type_id != invalid_type()); }

  template <typename T, typename... Args>
  void set(Args&&... args) {

--- a/paddle/fluid/lite/utils/varient_test.cc
+++ b/paddle/fluid/lite/utils/varient_test.cc
@@ -24,6 +24,8 @@ namespace utils {

 TEST(varient, test) {
  variant<int, float> a;
+  // The initial state should be invalid.
+  ASSERT_FALSE(a.valid());
  a.set<int>(1);
  ASSERT_EQ(a.get<int>(), 1);
  a.set<int>(20);