diff --git a/.gitignore b/.gitignore
index ce40fea2be877c09bb299781d8937c081843b50c..9db2912c07bc2d6abb01c322a25519ac0ff158fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,10 +104,3 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
 metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
 metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
 metal/MobileNetDemo/MobileNetDemo/Resources
-
-# generated files
-lite/api/paddle_use_kernels.h
-lite/api/paddle_use_ops.h
-lite/backends/arm/math/dotprod/gemm_sdot.h
-lite/tools/cmake_tools/ast.pyc
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 786b1322b346631d1570a6ebd9c572302531db4e..77a94bea1efcdafaa67b4c078bfb0a756f7b1cec 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,8 +22,6 @@ if (WITH_PADDLE_MOBILE)
   return()
 endif(WITH_PADDLE_MOBILE)
 
-# set(CMAKE_BUILD_TYPE DEBUG)
-
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(CMAKE_CXX_STANDARD 11)
diff --git a/fpga.sh b/fpga.sh
deleted file mode 100644
index e0501ac14b5269139688169017c057bd2458ab7c..0000000000000000000000000000000000000000
--- a/fpga.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-./lite/tools/build.sh \
-  --arm_os=armlinux \
-  --arm_abi=armv8 \
-  --arm_lang=gcc \
-  test
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index f5b7ea4d9f43b2a8802cd86da98bb8e95197d896..986796b4fbd1f6100eef030e46d3cf981fe717d4 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -41,6 +41,7 @@ USE_MIR_PASS(lite_quant_dequant_fuse_pass);
 USE_MIR_PASS(type_precision_cast_pass);
 USE_MIR_PASS(type_layout_cast_pass);
 USE_MIR_PASS(memory_optimize_pass);
+USE_MIR_PASS(kernel_place_correct_pass)
 USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
 USE_MIR_PASS(npu_subgraph_pass);
 USE_MIR_PASS(xpu_subgraph_pass);
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
index 81ae57fc88b25dd907c21efab7f79dfe7e524d98..9b1189c407d6d601bb3e5ba8172b1455f04710fd 100755
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -33,7 +33,7 @@ class Debugger {
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
     if (op_config[op_type]) {
-      // tensor->saveToFile(op_type, true);
+      tensor->saveToFile(op_type, true);
     }
   }
 
@@ -43,6 +43,8 @@ class Debugger {
     op_config["concat"] = true;
     op_config["pooling"] = true;
     op_config["conv"] = true;
+    op_config["dwconv"] = true;
+    op_config["ew_add"] = true;
     op_config["crop"] = true;
     op_config["feed"] = true;
     op_config["mul"] = true;
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
old mode 100644
new mode 100755
index bf5ab6212b852fdf1cb2a1b9856a1be5fccb7cf9..1408a034cb6a975e32d92da0406f98df7f2409c1
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -61,7 +61,9 @@ void reset_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
+
 #ifdef PADDLE_MOBILE_OS_LINUX
+
   void *ptr = reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
   if (ptr == MAP_FAILED) {
diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp
index 37c300bd8658a9794263add630a055e27366797b..cea22e0edc647b3bf4f0ac15e43121b5d8926154 100755
--- a/lite/backends/fpga/KD/pes/conv_process.hpp
+++ b/lite/backends/fpga/KD/pes/conv_process.hpp
@@ -656,7 +656,6 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
   }
   size_t size = params.size();
   if (ret == 0 && size > 1) {
-    // Tensor* output = conv_params.output;
     Tensor& img = params[0]->output;
     for (int i = 0; i < 1; i++) {
       for (int i = 0; i < img.shape().numel(); i++) {
diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
index 4bb5dc28009376307fea442093f3d9df55ecb894..9958990af6eb237d2122a63e1b7ed947ca329d31 100755
--- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
@@ -96,6 +96,7 @@ class DepthwiseConvPE : public PE {
     float16* scale_data = param_.scale()->data<float16>();
     float16* filter_data = param.quantizedFilter()->mutableData<float16>(
         FP16, param.filter->shape());
+    // memcpy(filter_data, scale_data, channel * sizeof(float16));
     memcpy(filter_data,
            scale_data,
diff --git a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp
old mode 100644
new mode 100755
diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
old mode 100755
new mode 100644
diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp
index 0e49a006abfc45203008bc6da99270b847889207..299ffb872b4620fc409eb8e66760a6308a814efb 100755
--- a/lite/backends/fpga/KD/pes/gru_pe.hpp
+++ b/lite/backends/fpga/KD/pes/gru_pe.hpp
@@ -121,7 +121,6 @@ class GRUPE : public PE {
       prev_hidden_.copyFrom(value.pre_output);
     }
     mul_pe_.dispatch();
-    // reset_hidden_.saveToFile("reset_hidden_.txt");
 
     update_gate_data += stride_update;
     reset_gate_data += stride_update;
@@ -172,7 +171,6 @@ class GRUPE : public PE {
   zynqmp::Tensor bias_;
   zynqmp::Tensor weight_;
   zynqmp::Tensor state_weight_;
-
   zynqmp::Tensor update_gate_;
   zynqmp::Tensor reset_gate_;
   zynqmp::Tensor cell_state_;
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
old mode 100755
new mode 100644
index 8cc3188018105f2ae93bf9b434820d24cb18a751..988bc1bb507036de8f13a6c6549c549718bd1256
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -346,19 +346,9 @@ class Tensor {
     if (placeHolder_ == nullptr) {
       return;
     }
-    std::cout << scale()[0] << " , " << scale()[1] << std::endl;
   }
 
-  void printScale(std::string type) {
-    std::cout << type << " : "
-              << std::to_string(shape_->num()) + "_" +
-                     std::to_string(shape_->channel()) + "_" +
-                     std::to_string(shape_->height()) + "_" +
-                     std::to_string(shape_->width())
-              << std::endl;
-    std::cout << type << " \n";
-    printScale();
-  }
+  void printScale(std::string type) { printScale(); }
 
   std::string dimsFileName() {
     return std::to_string(shape_->num()) + "_" +
@@ -386,7 +376,6 @@ class Tensor {
     static int counter = 0;
     std::string npath = std::to_string(counter) + "_" + path;
     counter++;
-    std::cout << "======== saving file:" << npath << " ============\n";
     save_file_with_name(npath);
   }
diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h
old mode 100755
new mode 100644
index 49aded3d7d7db6d293e13298d98c2f3b165f411f..311fc8a98400e5a6916ba1b9c8de1e6e0bcec4c0
--- a/lite/backends/fpga/lite_tensor.h
+++ b/lite/backends/fpga/lite_tensor.h
@@ -165,9 +165,6 @@ class TensorLite {
 
   TargetType target() const { return target_; }
 
-  // template <typename T>
-  // TensorLite Slice(int64_t begin, int64_t end) const;
-
   zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
 
   friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
@@ -257,7 +254,6 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
   int64_t base = numel() / dims_[0];
 
   TensorLite dst;
-  dst.target_ = target_;
   auto dst_dims = dims_;
   dst_dims[0] = end - begin;
diff --git a/lite/core/kernel.h b/lite/core/kernel.h
old mode 100755
new mode 100644
diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt
index a32e0295dbfc2b3e635472649b437b64f1e93145..fe03492a78ed8573182ed1c874b07a14bd7fa912 100644
--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
@@ -25,6 +25,7 @@ lite_cc_library(mir_passes
     elimination/elementwise_mul_constant_eliminate_pass.cc
     static_kernel_pick_pass.cc
     variable_place_inference_pass.cc
+    kernel_place_correct_pass.cc
     type_target_cast_pass.cc
    type_layout_cast_pass.cc
     type_precision_cast_pass.cc
diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
index ff5a7a1f25239d9dbfc79491bd137804b16b6cfa..2720404fb03cddaf00c9a25d8287b14d69ca86e8 100644
--- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
+++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
@@ -27,10 +27,24 @@ namespace mir {
 void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   // delete quant node
   std::vector<std::string> quant_op_types = {
-      "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
+      "fake_quantize_abs_max",
+      "fake_quantize_range_abs_max",
+      "fake_quantize_moving_average_abs_max"};
+  /*
+  for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) {
+    for (int i = 5; i >= 1; --i){
+      fusion::DynamicQuantDequantOpFuser fuser("fake_quantize_abs_max", op_type,
+                                               i);
+      fuser(graph.get());
+    }
+  }
+  */
+
   for (auto& op_type : quant_op_types) {
     fusion::DeleteQuantOpFuser fuser(op_type);
     fuser(graph.get());
+    fusion::DeleteDynamicQuantOpFuser dfuser(op_type);
+    dfuser(graph.get());
   }
 
   // fuse quantized node and dequant node
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
old mode 100644
new mode 100755
index da611e4490f4ba7268d9011b3dbb391a63a88305..578fac7eea151e2df95d777ffaeb20250f543b92
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
@@ -77,6 +77,55 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
   return op_desc;
 }
 
+void DeleteDynamicQuantOpFuser::BuildPattern() {
+  auto* input_act_node =
+      VarNode("input_act_node")->assert_is_op_input(quant_op_type_, "X");
+  auto* quant_node =
+      OpNode("quant_node", quant_op_type_)->assert_is_op(quant_op_type_);
+  auto* output_scale_node =
+      VarNode("output_scale_node")
+          ->assert_is_op_output(quant_op_type_, "OutScale");
+  auto* output_act_node =
+      VarNode("output_act_node")->assert_is_op_output(quant_op_type_, "Out");
+
+  quant_node->LinksFrom({input_act_node});
+  output_scale_node->LinksFrom({quant_node});
+  output_act_node->LinksFrom({quant_node});
+  VLOG(4) << "DeleteQuantOpFuser BuildPattern quant_op_type:" << quant_op_type_;
+}
+
+void DeleteDynamicQuantOpFuser::InsertNewNode(SSAGraph* graph,
+                                              const key2nodes_t& matched) {
+  auto* input_act_node = matched.at("input_act_node");
+  auto* quant_node = matched.at("quant_node");
+  auto* output_scale_node = matched.at("output_scale_node");
+  auto* output_act_node = matched.at("output_act_node");
+
+  // obtain values, save values and relink node
+  int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
+  int range = ((1 << (bit_length - 1)) - 1);
+  auto* scope = quant_node->stmt()->op()->scope();
+  auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name)
+                           ->GetMutable<lite::Tensor>();
+  float scale_value = scale_tensor->data<float>()[0] / range;
+
+  auto outlinks = output_act_node->outlinks;
+  for (auto* quantized_node : outlinks) {
+    auto* op_desc = quantized_node->stmt()->mutable_op_info();
+    op_desc->SetAttr("bit_length", bit_length);
+    IR_NODE_LINK_TO(input_act_node, quantized_node)
+  }
+
+  // delete nodes and edges
+  std::unordered_set<const Node*> nodes2rm = {
+      quant_node, output_scale_node, output_act_node};
+  GraphSafeRemoveNodes(graph, nodes2rm);
+}
+
+cpp::OpDesc DeleteDynamicQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
+  cpp::OpDesc op_desc;
+  return op_desc;
+}
 void DequantOpFuser::BuildPattern() {
   std::string weight_name = "";
   if (quantized_op_type_ == "conv2d" ||
@@ -130,8 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
   auto& valid_places = quantized_op->stmt()->op()->valid_places();
   int bit_length = quantized_op->stmt()->op_info()->GetAttr<int>("bit_length");
   int range = ((1 << (bit_length - 1)) - 1);
-  float input_scale =
-      quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
+  float input_scale = 0;
+  if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
+    input_scale =
+        quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
+  }
   float max_range = dequant_op->stmt()->op_info()->GetAttr<float>("max_range");
   float whole_weight_scale =
       static_cast<float>(range * range) / max_range / range;
@@ -162,8 +214,12 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
   for (int i = 0; i < weight_scale_size; i++) {
     weight_scale.push_back(whole_weight_scale);
   }
+#ifndef LITE_WITH_FPGA
   op_desc.SetAttr("enable_int8", true);
-  op_desc.SetAttr("input_scale", input_scale);
+#endif
+  if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
+    op_desc.SetAttr("input_scale", input_scale);
+  }
   op_desc.SetAttr("weight_scale", weight_scale);
 
   // change the weight from the float type to int8 type.
@@ -171,12 +227,30 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
   temp_tensor.CopyDataFrom(*quantized_weight_t);
   float* temp_data = temp_tensor.mutable_data<float>();
   size_t weight_num = quantized_weight_t->data_size();
+
+#ifdef LITE_WITH_FPGA
+  float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
+  for (size_t i = 0; i < weight_num; i++) {
+    quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
+  }
+  quantized_weight_t->set_persistable(true);
+  quantized_weight_t->set_precision(PRECISION(kFloat));
+#else
   int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
   for (size_t i = 0; i < weight_num; i++) {
     quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
   }
   quantized_weight_t->set_persistable(true);
   quantized_weight_t->set_precision(PRECISION(kInt8));
+#endif
+
+
+  // int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
+  // for (size_t i = 0; i < weight_num; i++) {
+  //   quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
+  // }
+  // quantized_weight_t->set_persistable(true);
+  // quantized_weight_t->set_precision(PRECISION(kInt8));
 
   // new op and relink nodes
   auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
@@ -464,6 +538,194 @@ cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
   cpp::OpDesc op_desc;
   return op_desc;
 }
+// ================dynamic quant fuse==============
+// #define DYNAMIC_RANGE
+void DynamicQuantDequantOpFuser::BuildPattern() {
+  const int kNumFields = 5;
+  const int kQuantizedWeightOffset = 0;
+  const int kQuantizedOpOffset = 1;
+  const int kQuantizedOpOutOffset = 2;
+  const int kDequantOpOffset = 3;
+  const int kDequantOpOutOffset = 4;
+
+  std::string weight_name = "";
+  if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
+    weight_name = "Filter";
+  } else {
+    weight_name = "Y";
+  }
+  auto* quant_op_input = VarNode("quant_op_input")
+                             ->assert_is_op_input(quant_type_, "X")
+                             ->AsInput();
+#ifdef DYNAMIC_RANGE
+  auto* quant_op_in_scale = VarNode("quant_op_in_scale")
+                                ->assert_is_op_input(quant_type_, "InScale")
+                                ->AsIntermediate();
+#endif
+  auto* quant_op = OpNode("quant_op", quant_type_)
+                       ->assert_is_op(quant_type_)
+                       ->AsIntermediate();
+
+  auto* quant_op_out_scale =
+      VarNode("quant_op_out_scale")
+          ->assert_is_op_output(quant_type_, "OutScale")
+          ->assert_is_op_input("fake_dequantize_max_abs", "Scale")
+          ->AsIntermediate();
+
+  auto* quant_op_out = VarNode("quant_op_out")
+                           ->assert_is_op_output(quant_type_, "Out")
+                           ->assert_is_op_input(op_type_)
+                           ->AsIntermediate();
+  std::vector<PMNode*> nodes;
+  for (int i = 0; i < times_; i++) {
+    nodes.push_back(VarNode(string_format("quantized_op_weight%d", i))
+                        ->assert_is_op_input(op_type_, weight_name)
+                        ->AsInput());
+
+    nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_)
+                        ->assert_is_op(op_type_)
+                        ->AsIntermediate());
+
+    nodes.push_back(VarNode(string_format("quantized_op_out%d", i))
+                        ->assert_is_op_output(op_type_)
+                        ->assert_is_op_input("fake_dequantize_max_abs", "X")
+                        ->AsIntermediate());
+
+    nodes.push_back(
+        OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs")
+            ->assert_is_op("fake_dequantize_max_abs")
+            ->AsIntermediate());
+    nodes.push_back(VarNode(string_format("dequant_op_out%d", i))
+                        ->assert_is_op_output("fake_dequantize_max_abs", "Out")
+                        ->AsOutput());
+  }
+
+#ifdef DYNAMIC_RANGE
+  quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
+#endif
+  quant_op->LinksFrom({quant_op_input});
+  quant_op_out->LinksFrom({quant_op});
+  quant_op_out_scale->LinksFrom({quant_op});
+  for (int i = 0; i < times_; i++) {
+    nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
+        {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
+    nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kQuantizedOpOffset]});
+    nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
+        {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
+    nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kDequantOpOffset]});
+  }
+}
+
+void DynamicQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
+                                               const key2nodes_t& matched) {
+  const int kNumFields = 5;
+  const int kQuantizedWeightOffset = 0;
+  const int kQuantizedOpOffset = 1;
+  const int kDequantOpOffset = 3;
+  const int kDequantOpOutOffset = 4;
+
+  auto* quant_op_input = matched.at("quant_op_input");
+#ifdef DYNAMIC_RANGE
+  auto* quant_op_in_scale = matched.at("quant_op_in_scale");
+#endif
+  auto* quant_op = matched.at("quant_op");
+
+  std::vector<Node*> nodes;
+  for (int i = 0; i < times_; i++) {
+    nodes.push_back(matched.at(string_format("quantized_op_weight%d", i)));
+    nodes.push_back(matched.at(string_format("quantized_op%d", i)));
+    nodes.push_back(matched.at(string_format("quantized_op_out%d", i)));
+    nodes.push_back(matched.at(string_format("dequant_op%d", i)));
+    nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
+  }
+  int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
+  auto* scope = quant_op->stmt()->op()->scope();
+  auto& valid_places = quant_op->stmt()->op()->valid_places();
+  int range = ((1 << (bit_length - 1)) - 1);
+
+#ifdef DYNAMIC_RANGE
+  auto input_scale_t = scope->FindVar(quant_op_in_scale->arg()->name)
+                           ->GetMutable<lite::Tensor>();
+  float input_scale = input_scale_t->data<float>()[0] / range;
+  VLOG(4) << "range: " << range << " input_scale: " << input_scale;
+#endif
+  for (int i = 0; i < times_; i++) {
+    float max_range = nodes[i * kNumFields + kDequantOpOffset]
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<float>("max_range");
+    // weight_scale = max(abs(weight))
+    float whole_weight_scale =
+        static_cast<float>(range * range) / max_range / range;
+
+    cpp::OpDesc op_desc =
+        *nodes[i * kNumFields + kQuantizedOpOffset]->stmt()->op_info();
+
+    auto quantized_weight_var_name =
+        nodes[i * kNumFields + kQuantizedWeightOffset]->arg()->name;
+    auto quantized_weight_t =
+        scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
+    std::vector<float> weight_scale;
+    int weight_scale_size;
+
+    if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
+      op_desc.SetInput("Input", {matched.at("quant_op_input")->arg()->name});
+      op_desc.SetOutput(
+          "Output", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
+      // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
+      // be Cout.
+      weight_scale_size = quantized_weight_t->dims()[0];
+    } else if (op_type_ == "mul") {
+      op_desc.SetInput("X", {matched.at("quant_op_input")->arg()->name});
+      op_desc.SetOutput(
+          "Out", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
+      // Fc weight: Cin * Cout, the weight_scale_size should be Cout.
+      weight_scale_size = quantized_weight_t->dims()[1];
+    }
+    for (int i = 0; i < weight_scale_size; i++) {
+      weight_scale.push_back(whole_weight_scale);
+    }
+    // op_desc.SetAttr("enable_int8", true);
+    // op_desc.SetAttr("input_scale", input_scale);
+    op_desc.SetAttr("weight_scale", weight_scale);
+
+    Tensor temp_tensor;
+    temp_tensor.CopyDataFrom(*quantized_weight_t);
+    float* temp_data = temp_tensor.mutable_data<float>();
+    size_t weight_num = quantized_weight_t->data_size();
+    quantized_weight_t->set_persistable(true);
+    std::cout << "DynamicQuantDequantOpFuser::InsertNewNode============================================================" << std::endl;
+#ifdef LITE_WITH_FPGA
+    float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
+    for (size_t i = 0; i < weight_num; i++) {
+      quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
+      std::cout << whole_weight_scale << "," << temp_data[i] << "," << quantized_weight_data[i] << std::endl;
+    }
+    quantized_weight_t->set_precision(PRECISION(kFloat));
+#else
+    int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
+    for (size_t i = 0; i < weight_num; i++) {
+      quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
+    }
+    quantized_weight_t->set_precision(PRECISION(kInt8));
+#endif
+    auto quantized_op = LiteOpRegistry::Global().Create(op_type_);
+    quantized_op->Attach(op_desc, scope);
+    auto* new_op_node =
+        graph->GraphCreateInstructNode(quantized_op, valid_places);
+    IR_NODE_LINK_TO(quant_op_input, new_op_node);
+    IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset],
+                    new_op_node);
+    IR_NODE_LINK_TO(new_op_node, nodes[i * kNumFields + kDequantOpOutOffset]);
+  }
+}
+
+cpp::OpDesc DynamicQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
+  cpp::OpDesc op_desc;
+  return op_desc;
+}
 }  // namespace fusion
 }  // namespace mir
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h
index bef9f4d9573d049700736c166cd0d31b668f7eff..c21df350f96143a09b3229776bf5c013b1988559 100644
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.h
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h
@@ -52,6 +52,19 @@ class DeleteQuantOpFuser : public FuseBase {
  private:
   std::string quant_op_type_{};
 };
+class DeleteDynamicQuantOpFuser : public FuseBase {
+ public:
+  explicit DeleteDynamicQuantOpFuser(const std::string& quant_op_type)
+      : quant_op_type_(quant_op_type) {}
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+
+ private:
+  std::string quant_op_type_{};
+};
 
 /* DequantOpFuser process conv2d/depthwise_conv2d/mul + fake_dequantize_max_abs.
  */
@@ -106,6 +119,24 @@ class DeleteQuantDequantOpFuser : public FuseBase {
  private:
   std::string quantized_op_type_{};
 };
+// dynamic quantdequant op fuser
+class DynamicQuantDequantOpFuser : public FuseBase {
+ public:
+  explicit DynamicQuantDequantOpFuser(const std::string& quantized_op_type,
+                                      const std::string& op_type,
+                                      int i)
+      : op_type_(op_type), quant_type_(quantized_op_type), times_(i) {}
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+
+ private:
+  std::string op_type_{};
+  std::string quant_type_{};
+  int times_{1};
+};
 
 }  // namespace fusion
 }  // namespace mir
diff --git a/lite/core/mir/kernel_place_correct_pass.cc b/lite/core/mir/kernel_place_correct_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dad7687bbec1ddbd7c8c787338005955de964f17
--- /dev/null
+++ b/lite/core/mir/kernel_place_correct_pass.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/kernel_place_correct_pass.h"
+#include <memory>
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void KernelPlaceCorrectPass::Apply(const std::unique_ptr<SSAGraph> &graph) {
+  CorrectArgumentPlace(graph.get());
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(kernel_place_correct_pass,
+                  paddle::lite::mir::KernelPlaceCorrectPass)
+    .BindTargets({TARGET(kFPGA)});
diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..5fab5000862378976c16448f5a82f052ffbc20a5
--- /dev/null
+++ b/lite/core/mir/kernel_place_correct_pass.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/mir/pass.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+/*
+ * Correct the place of the variables in the SSAGraph; it infers each
+ * variable's place from the kernel that outputs it.
+ */
+class KernelPlaceCorrectPass : public DebugPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+
+ private:
+  void CorrectArgumentPlace(SSAGraph* graph) {
+    auto& valid_places = graph->valid_places();
+    auto valid_places_has_target = [&](TargetType t) -> bool {
+      for (auto& p : valid_places) {
+        if (p.target == t) {
+          return true;
+        }
+      }
+      return false;
+    };
+    std::map<std::string, bool> lite_with_targets{
+        {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
+        {"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
+    VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
+    VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
+
+    VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
+    for (auto& x : graph->StmtTopologicalOrder()) {
+      auto& inst = x->AsStmt();
+      // The IoCopyOp is a tool operator, it won't support the type inference.
+      // On FPGA we have io_copy + calib + layout tool ops, so we need type
+      // inference for tool operators.
+      if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
+        VLOG(3) << "inst.op_type() == 'io_copy', continue";
+        if (inst.op_type() == "io_copy") continue;
+      }
+      // deal with inputs
+      VLOG(4) << "checking op " << inst.op_info()->Repr();
+
+      auto get_argname = [&](
+          const std::string& node_name,
+          const std::map<std::string, std::vector<std::string>>& argname_map)
+          -> std::string {
+        for (auto& ele : argname_map) {
+          auto it =
+              std::find(ele.second.begin(), ele.second.end(), node_name);
+          if (it != ele.second.end()) return ele.first;
+        }
+        return "";
+      };
+
+      bool need_correct_place = true;
+
+      std::vector<TargetType> in_types;
+      std::vector<TargetType> out_types;
+      for (auto* x_in : x->inlinks) {
+        std::string node_name = x_in->AsArg().name;
+        std::string arg_name = get_argname(node_name, inst.op_info()->inputs());
+        CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
+                                   << node_name;
+        VLOG(4) << "-- input arg_name:" << arg_name << " "
+                << "-- node name:" << node_name;
+        auto type = inst.picked_kernel().GetInputDeclType(arg_name);
+        if (!x_in->AsArg().type) {
+          need_correct_place &= false;
+        } else {
+          if (in_types.empty()) {
+            in_types.push_back(x_in->AsArg().type->target());
+          } else {
+            if (in_types[0] != x_in->AsArg().type->target()) {
+              need_correct_place &= false;
+            }
+          }
+        }
+      }
+
+      for (auto* x_out : x->outlinks) {
+        std::string node_name = x_out->AsArg().name;
+        std::string arg_name =
+            get_argname(node_name, inst.op_info()->outputs());
+        CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
+                                   << node_name << " in Inst "
+                                   << inst.op_type();
+        VLOG(4) << "-- output arg_name " << arg_name;
+        auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
+        if (!x_out->AsArg().type) {
+          need_correct_place &= false;
+        } else {
+          if (out_types.empty()) {
+            out_types.push_back(x_out->AsArg().type->target());
+          } else {
+            if (out_types[0] != x_out->AsArg().type->target()) {
+              need_correct_place &= false;
+            }
+          }
+        }
+      }
+
+      auto this_type = inst.picked_kernel().target();
+      bool io_target_same = (in_types[0] == out_types[0]);
+      need_correct_place &= (io_target_same && (in_types[0] != this_type));
+      if (need_correct_place) {
+        // update this kernel's valid place;
+        UpdateTarget(inst, in_types[0]);
+      }
+    }
+  }
+
+  // Update me's kUnk fields by other's fields.
+  void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) {  // NOLINT
+    auto new_place = inst.place();
+    new_place.target = new_target;
+    std::vector<Place> places;
+    places.push_back(new_place);
+    inst.ResetKernels(places);
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/node.cc b/lite/core/mir/node.cc
index 4a90e530a46c4d42d2ba032da1828973dfc1bcef..52fd39182a7132777231929d49c319bb961cf7f9 100644
--- a/lite/core/mir/node.cc
+++ b/lite/core/mir/node.cc
@@ -53,6 +53,11 @@ void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
   }
   valid_kernels_ = op_->CreateKernels(valid_places);
 }
+void mir::Node::Stmt::ResetKernels(const std::vector<Place> &valid_places) {
+  CHECK(op_) << "change valid place failed, not created op";
+  valid_kernels_.clear();
+  valid_kernels_ = op_->CreateKernels(valid_places);
+}
 
 mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
   auto &x = AsArg();
diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h
index e2c8a68bde6ee18506de73a7531716695b3d54f1..e7c44d2be689a9d890158c097e198314413d1ba3 100644
--- a/lite/core/mir/node.h
+++ b/lite/core/mir/node.h
@@ -53,6 +53,7 @@ class Node {
                  const std::vector<Place>& valid_places,
                  lite::Scope* scope = nullptr);
 
+    void ResetKernels(const std::vector<Place>& valid_places);
     std::string op_type() const { return op_info()->Type(); }
     const OpInfo* op_info() const;
     OpInfo* mutable_op_info();
diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc
old mode 100644
new mode 100755
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 26ea72cb25e50110ebbeba52d265236730e2ecdf..bb103647c3f389b304ae7d0aa1089843fa781a0f 100755
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -76,6 +76,7 @@ class Optimizer {
 #endif
           "static_kernel_pick_pass",        // pick original kernel from graph
           "variable_place_inference_pass",  // inference arg/var's
+          "kernel_place_correct_pass",
                                             // info(target/precision/layout/device)
                                             // using kernel info
           "argument_type_display_pass",     // debug pass: show arg-type-node's
diff --git a/lite/core/program.cc b/lite/core/program.cc
index 93ea2137a8431db3602ed34b6845a19c45e92b8a..8b1c6687463f3ca04ffb924efff8b814ae711415 100755
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -148,7 +148,7 @@ void RuntimeProgram::Run() {
 #ifdef LITE_WITH_PROFILE
 #ifdef LITE_WITH_PRECISION_PROFILE
 #ifndef LITE_WITH_FPGA
-// LITE_PRECISION_PROFILE(inst)
+    LITE_PRECISION_PROFILE(inst)
 #endif
 #endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc
old mode 100755
new mode 100644
index fa7e2c0c3ae4580f5d19e82f7c48c74db3058847..6c0523ab600ae6352fc4d7716bc2a248d19ea8b5
--- a/lite/kernels/arm/lookup_table_compute.cc
+++ b/lite/kernels/arm/lookup_table_compute.cc
@@ -28,7 +28,6 @@ namespace arm {
 
 void LookupTableCompute::Run() {
   auto& param = this->Param<param_t>();
-  auto& ctx = this->ctx_->template As<ARMContext>();
   // inputs
   auto w = param.W;
   auto ids = param.Ids;
@@ -76,3 +75,13 @@ REGISTER_LITE_KERNEL(lookup_table,
     .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+REGISTER_LITE_KERNEL(lookup_table_v2,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::LookupTableCompute,
+                     def)
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
diff --git a/lite/kernels/fpga/calib_compute.h b/lite/kernels/fpga/calib_compute.h
old mode 100755
new mode 100644
diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc
old mode 100644
new mode 100755
index 2293267f021d5a7bc003e69f3be84d8205ce2746..06e317c253cb06778162e2fa7ed08456fb4f6f17
--- a/lite/kernels/fpga/conv_compute.cc
+++ b/lite/kernels/fpga/conv_compute.cc
@@ -71,6 +71,13 @@ void ConvCompute::PrepareForRun() {
     if (param.fuse_relu) {
       conv_param.activeParam.type = zynqmp::TYPE_RELU;
     }
+
+    // conv_param.filter->saveToFile("conv_filter_", true);
+    // if (param.bias != nullptr) {
+    //   std::cout << "param.bias != nullptr" << std::endl;
+    //   conv_param.bias()->saveToFile("conv_bias_", true);
+    // }
+
     conv_pe_.init();
     conv_pe_.apply();
   }
@@ -79,26 +86,18 @@ void ConvCompute::PrepareForRun() {
 void ConvCompute::Run() {
   auto& param = this->Param<param_t>();
   if (param.x->ZynqTensor()->shape().channel() != 1 &&
-      param.groups == param.x->ZynqTensor()->shape().channel()) {
+      param.groups == param.x->ZynqTensor()->shape().channel()) {
     dw_conv_pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+    zynqmp::DepthwiseConvParam& dwconv_param = dw_conv_pe_.param();
+    Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
+#endif
   } else {
-    zynqmp::ConvParam& conv_param = conv_pe_.param();
-
-    if (conv_param.output->shape().channel() == 12 &&
-        conv_param.output->shape().height() == 13) {
-      conv_param.input->saveToFile("conv_in", true);
-      conv_param.output->saveToFile("conv_o", true);
-    }
-
+    // zynqmp::ConvParam& conv_param = conv_pe_.param();
    conv_pe_.dispatch();
-    if (conv_param.output->shape().channel() == 12 &&
-        conv_param.output->shape().height() == 13) {
-      // conv_param.input->saveToFile("conv_in", true);
-      conv_param.output->saveToFile("conv_out", true);
-    }
 #ifdef FPGA_PRINT_TENSOR
-    // zynqmp::ConvParam& conv_param = conv_pe_.param();
+    zynqmp::ConvParam& conv_param = conv_pe_.param();
     Debugger::get_instance().registerOutput("conv", conv_param.output);
 #endif
   }
@@ -122,3 +121,17 @@ REGISTER_LITE_KERNEL(
                            PRECISION(kFP16),
                            DATALAYOUT(kNHWC))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    depthwise_conv2d, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::ConvCompute, def)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Output",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
+    .Finalize();
diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc
index f2276cabf8445c64ea02a1dbdc761586bc5a1f9b..d22cc7abacc2ecd80e54aa5c62a7e57671b920c9 100755
--- a/lite/kernels/fpga/elementwise_compute.cc
+++ b/lite/kernels/fpga/elementwise_compute.cc
@@ -125,7 +125,10 @@ REGISTER_LITE_KERNEL(elementwise_add,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
diff --git a/lite/kernels/fpga/elementwise_compute_test.cc b/lite/kernels/fpga/elementwise_compute_test.cc
old mode 100644
new mode 100755
index add60f64602105d317c3657985c0011aff246608..97b64091bb4cd54c42e721fb1c75d01c331a6ae0
--- a/lite/kernels/fpga/elementwise_compute_test.cc
+++ b/lite/kernels/fpga/elementwise_compute_test.cc
@@ -93,18 +93,22 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
   }
   // do elementwise add/sub/max...
   if (elt_type == "add") {
-    for (int i = 0; i < batch; ++i) {
-      for (int j = 0; j < channels; ++j) {
-        int offset = (i * channels + j) * num;
-        const dtype* din_ptr = x_data + offset;
-        const dtype diny_data = y_data[j];
-        dtype* dout_ptr = out_data + offset;
-        for (int k = 0; k < num; ++k) {
-          *dout_ptr = sum(*din_ptr, diny_data);
-          dout_ptr++;
-          din_ptr++;
-        }
-      }
+    // for (int i = 0; i < batch; ++i) {
+    //   for (int j = 0; j < channels; ++j) {
+    //     int offset = (i * channels + j) * num;
+    //     const dtype* din_ptr = x_data + offset;
+    //     const dtype diny_data = y_data[j];
+    //     dtype* dout_ptr = out_data + offset;
+    //     for (int k = 0; k < num; ++k) {
+    //       *dout_ptr = zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr), zynqmp::half_to_float(diny_data)));
+    //       dout_ptr++;
+    //       din_ptr++;
+    //     }
+    //   }
+    // }
+    int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
+    for (int i = 0; i < count; ++i) {
+      out_data[i] = zynqmp::float_to_half(sum(zynqmp::half_to_float(x_data[i]), zynqmp::half_to_float(y_data[i])));
     }
   } else if (elt_type == "sub") {
     for (int i = 0; i < batch; ++i) {
@@ -148,9 +152,9 @@ TEST(elementwise_add, compute) {
   lite::Tensor x, y, output, output_ref;
 
   for (auto n : {1}) {
-    for (auto c : {8}) {
-      for (auto h : {8}) {
-        for (auto w : {8}) {
+    for (auto h : {72}) {
+      for (auto w : {192}) {
+        for (auto c : {24}) {
           for (auto axis : {0}) {
             for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
               auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
@@ -174,10 +178,16 @@ TEST(elementwise_add, compute) {
               auto* output_ref_data =
                   output_ref.mutable_data<float16>(TARGET(kFPGA));
               for (int i = 0; i < x_dim.production(); i++) {
-                x_data[i] = zynqmp::float_to_half(i);
+                float sign = i % 3 == 0 ? -0.03 : 0.05f;
+                float x = sign * (i % 128);
+                std::cout << "x:" << x << std::endl;
+                x_data[i] = zynqmp::float_to_half(x);
               }
               for (int i = 0; i < y_dim.production(); i++) {
-                y_data[i] = zynqmp::float_to_half(i);
+                float sign = i % 3 == 0 ? -0.03 : 0.05f;
+                float y = sign * (i % 128);
+                std::cout << "y:" << y << std::endl;
+                y_data[i] = zynqmp::float_to_half(y);
               }
               param.X = &x;
               param.Y = &y;
@@ -190,7 +200,8 @@ TEST(elementwise_add, compute) {
               elementwise_compute_ref<float16>(param, "add", "");
               for (int i = 0; i < output.dims().production(); i++) {
-                EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+                std::cout << "output_data:" << zynqmp::half_to_float(output_data[i]) << ",output_ref_data:" << zynqmp::half_to_float(output_ref_data[i]) << std::endl;
+                EXPECT_NEAR(zynqmp::half_to_float(output_data[i]), zynqmp::half_to_float(output_ref_data[i]), 1e-5);
               }
             }
           }
@@ -209,73 +220,73 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
   ASSERT_TRUE(fusion_elementwise_add_activation.front());
 }
 
-TEST(fusion_elementwise_add_activation_fpga, init) {
-  ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-  ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
-  ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
-}
+// TEST(fusion_elementwise_add_activation_fpga, init) {
+//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+//   ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
+//   ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
+// }
 
-TEST(fusion_elementwise_add_activation_fpga, compute) {
-  ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-  operators::FusionElementwiseActivationParam param;
-  lite::Tensor x, y, output, output_ref;
+// TEST(fusion_elementwise_add_activation_fpga, compute) {
+//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+//   operators::FusionElementwiseActivationParam param;
+//   lite::Tensor x, y, output, output_ref;
 
-  for (auto act_type : {"relu"}) {
-    for (auto n : {1}) {
-      for (auto c : {8}) {
-        for (auto h : {8}) {
-          for (auto w : {8}) {
-            for (auto axis : {0}) {
-              for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
-                auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
-                auto y_dim = DDim(yd);
-                int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
+// for (auto act_type : {"relu"}) {
+//   for (auto n : {1}) {
+//     for (auto c : {8}) {
+//       for (auto h : {8}) {
+//         for (auto w : {8}) {
+//           for (auto axis : {0}) {
+//             for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
+//               auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
+//               auto y_dim = DDim(yd);
+//               int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
 
-                if (axis_t + y_dim.size() > 4) continue;
-                bool flag = false;
-                for (int i = 0; i < y_dim.size(); i++) {
-                  if (x_dim[i + axis_t] != y_dim[i]) flag = true;
-                }
-                if (flag) continue;
+//               if (axis_t + y_dim.size() > 4) continue;
+//               bool flag = false;
+//               for (int i = 0; i < y_dim.size(); i++) {
+//                 if (x_dim[i + axis_t] != y_dim[i]) flag = true;
+//               }
+//               if (flag) continue;
 
-                x.Resize(x_dim);
-                y.Resize(y_dim);
-                output.Resize(x_dim);
-                output_ref.Resize(x_dim);
-                auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
-                auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
-                auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
-                auto* output_ref_data =
-                    output_ref.mutable_data<float16>(TARGET(kFPGA));
-                for (int i = 0; i < x_dim.production(); i++) {
-                  float sign = i % 3 == 0 ? -1.0f : 1.0f;
-                  x_data[i] = zynqmp::float_to_half(i * sign);
-                }
-                for (int i = 0; i < y_dim.production(); i++) {
-                  float sign = i % 2 == 0 ? 0.5f : -0.5f;
-                  y_data[i] = zynqmp::float_to_half(i * sign);
-                }
-                param.X = &x;
-                param.Y = &y;
-                param.axis = axis;
-                param.Out = &output;
-                param.act_type = act_type;
-                fusion_elementwise_add_activation.SetParam(param);
-                fusion_elementwise_add_activation.PrepareForRun();
-                fusion_elementwise_add_activation.Run();
-                param.Out = &output_ref;
-                elementwise_compute_ref<float16>(param, "add", act_type);
-                for (int i = 0; i < output.dims().production(); i++) {
-                  EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
+//               x.Resize(x_dim);
+//               y.Resize(y_dim);
+//               output.Resize(x_dim);
+//               output_ref.Resize(x_dim);
+//               auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
+//               auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
+//               auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
+//               auto* output_ref_data =
+//                   output_ref.mutable_data<float16>(TARGET(kFPGA));
+//               for (int i = 0; i < x_dim.production(); i++) {
+//                 float sign = i % 3 == 0 ? -1.0f : 1.0f;
+//                 x_data[i] = zynqmp::float_to_half(i * sign);
+//               }
+//               for (int i = 0; i < y_dim.production(); i++) {
+//                 float sign = i % 2 == 0 ? 0.5f : -0.5f;
+//                 y_data[i] = zynqmp::float_to_half(i * sign);
+//               }
+//               param.X = &x;
+//               param.Y = &y;
+//               param.axis = axis;
+//               param.Out = &output;
+//               param.act_type = act_type;
+//               fusion_elementwise_add_activation.SetParam(param);
+//               fusion_elementwise_add_activation.PrepareForRun();
+//               fusion_elementwise_add_activation.Run();
+//               param.Out = &output_ref;
+//               elementwise_compute_ref<float16>(param, "add", act_type);
+//               for (int i = 0; i < output.dims().production(); i++) {
+//                 EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+//               }
+//             }
+//           }
+//         }
+//       }
+//     }
+//   }
+// }
+// }
 
 }  // namespace fpga
 }  // namespace kernels
@@ -283,4 +294,4 @@ TEST(fusion_elementwise_add_activation_fpga, compute)
 }  // namespace paddle
 
 USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
-USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
+// USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
diff --git a/lite/kernels/fpga/gru_compute.h b/lite/kernels/fpga/gru_compute.h
old mode 100755
new mode 100644
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
index 57a76dee97ca889cd645a2c8f81b5a2354f9b11f..4554c24e07de656b948826c2fa6f9526f61daaa6 100755
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -191,8 +191,6 @@ class IoCopyFpgaToHostCHWCompute
     param.y->ZynqTensor()->flush();
     auto out_lod = param.y->mutable_lod();
     *out_lod = param.x->lod();
-    // param.x->ZynqTensor()->saveToFile("io_x", true);
-    // param.y->ZynqTensor()->saveToFile("io_y", true);
   }
   std::string doc() const override { return "Copy IO from FPGA to HOST"; }
 };
diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc
index c889df17cb72a6d3e8ab02efc729ecc93fb38a5f..afd14ccb4b4a9a4f1e93e1e38840035fb18186bb 100644
--- a/lite/kernels/fpga/prior_box_compute.cc
+++ b/lite/kernels/fpga/prior_box_compute.cc
@@ -78,7 +78,6 @@ void PriorBoxCompute::PrepareForRun() {
   param.boxes->mutable_data<float>();
   param.variances->mutable_data<float>();
-
   zynqmp::PriorBoxParam& priobox_param = pe_.param();
   priobox_param.input = param.input->ZynqTensor();
   priobox_param.image = param.image->ZynqTensor();
diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc
old mode 100755
new mode 100644
diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc
old mode 100755
new mode 100644
diff --git a/lite/kernels/host/reshape_compute.cc b/lite/kernels/host/reshape_compute.cc
index 7a826ed32b02a85860038482d8ca55c5db32a9bf..10c50d20b9c52f72d09c4519716e2defb047a23f 100644
--- a/lite/kernels/host/reshape_compute.cc
+++ b/lite/kernels/host/reshape_compute.cc
@@ -63,26 +63,6 @@ REGISTER_LITE_KERNEL(reshape,
                          DATALAYOUT(kAny))})
     .Finalize();
 
-// REGISTER_LITE_KERNEL(reshape,
-//                      kFPGA,
-//                      kFP16,
-//                      kNHWC,
-//                      paddle::lite::kernels::host::ReshapeCompute,
-//                      def)
-//     .BindInput("X",
-//                {LiteType::GetTensorTy(
-//                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
-//     .BindInput("ShapeTensor",
-//                {LiteType::GetTensorTy(
-//                    TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
-//     .BindInput("Shape",
-//                {LiteType::GetTensorTy(
-//                    TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
-//     .BindOutput("Out",
-//                 {LiteType::GetTensorTy(
-//                     TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
-//     .Finalize();
-
 REGISTER_LITE_KERNEL(reshape2,
                      kHost,
                      kAny,
diff --git a/lite/operators/concat_op.cc b/lite/operators/concat_op.cc
old mode 100644
new mode 100755
diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc
old mode 100644
new mode 100755
diff --git a/lite/operators/fake_quantize_range_abs_max.cc b/lite/operators/fake_quantize_range_abs_max.cc
index a8ce3f75a59fec5b032c60f51177f428bd15fe0d..ebf7e41f4b1af6f6961da07fe95caece19fa59f5 100644
--- a/lite/operators/fake_quantize_range_abs_max.cc
+++ b/lite/operators/fake_quantize_range_abs_max.cc
@@ -23,3 +23,5 @@ namespace operators {}  // namespace operators
 
 REGISTER_LITE_OP(fake_quantize_range_abs_max,
                  paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
+REGISTER_LITE_OP(fake_quantize_abs_max,
+                 paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
diff --git a/lite/operators/fake_quantize_range_abs_max.h b/lite/operators/fake_quantize_range_abs_max.h
index 726731595a9c4b7cd2e30db911230cc2f00b5b92..f68d1e20f6e60bb5aa99a2402ea8c9f88aa18470 100644
--- a/lite/operators/fake_quantize_range_abs_max.h
+++ b/lite/operators/fake_quantize_range_abs_max.h
@@ -40,13 +40,15 @@ class FakeQuantizeRangeMaxAbsOpLite : public OpLite {
 
   bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
     auto x = op_desc.Input("X").front();
-    auto in_scale = op_desc.Input("InScale").front();
+    if (op_desc.HasInput("InScale")) {
+      auto in_scale = op_desc.Input("InScale").front();
+      param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
+    }
 
     auto out = op_desc.Output("Out").front();
     auto out_scale = op_desc.Output("OutScale").front();
 
     param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
-    param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
     param_.out = scope->FindVar(out)->GetMutable<lite::Tensor>();
     param_.out_scale = scope->FindVar(out_scale)->GetMutable<lite::Tensor>();
diff --git a/lite/tools/build.sh b/lite/tools/build.sh
index 6121186e7c983145f2f9f450f6a23ea1957bb496..e1610b60d3b1b104699ab175bca3bb3cf81bd40b 100755
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
 
 # global variables
-BUILD_EXTRA=ON
+BUILD_EXTRA=OFF
 BUILD_JAVA=ON
 BUILD_PYTHON=OFF
 BUILD_DIR=$(pwd)
diff --git a/mobile/src/fpga/KD/pes/conv_pe.hpp b/mobile/src/fpga/KD/pes/conv_pe.hpp
old mode 100644
new mode 100755
index 5ef89e920e60cd2ef1c57e1f342a342a4149563f..388672a99325c2d04d87c90fa5a6b556b676a820
--- a/mobile/src/fpga/KD/pes/conv_pe.hpp
+++ b/mobile/src/fpga/KD/pes/conv_pe.hpp
@@ -29,7 +29,6 @@ namespace zynqmp {
 class ConvPE : public PE {
  public:
   bool init() {
-    std::cout << "Conv init" << std::endl;
     return true;
   }