diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100644
new mode 100755
index e3f7a211d70920aa74765b976af6939d55a328ab..3616823985bffb9d53615a031759c701d4b2ff09
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,9 @@ if (WITH_PADDLE_MOBILE)
   return()
 endif(WITH_PADDLE_MOBILE)
 
+# set(CMAKE_BUILD_TYPE DEBUG)
+
+
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(CMAKE_CXX_STANDARD 11)
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 70f483822ac484576fe6934c0a30e85593e1e93a..8ef2257f17465be8e6ac92a842862ac68e45f765 100755
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -223,6 +223,25 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels})
 
+  lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
+     DEPS ${lite_model_test_DEPS}
+     CL_DEPS ${opencl_kernels}
+     FPGA_DEPS ${fpga_kernels})
+
+  lite_cc_test(test_inceptionv3_fpga SRCS inceptionv3_test_fpga.cc
+     DEPS ${lite_model_test_DEPS}
+     CL_DEPS ${opencl_kernels}
+     FPGA_DEPS ${fpga_kernels})
+
+  lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
+     DEPS ${lite_model_test_DEPS}
+     CL_DEPS ${opencl_kernels}
+     ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
+          --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
+  add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
+  lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc
+     DEPS ${lite_model_test_DEPS})
+
   # lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
   #    DEPS ${lite_model_test_DEPS}
   #    CL_DEPS ${opencl_kernels}
diff --git a/lite/api/inceptionv3_test_fpga.cc b/lite/api/inceptionv3_test_fpga.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca3807124e69197bb9c6c62385a7d7e4f30449c7
--- /dev/null
+++ b/lite/api/inceptionv3_test_fpga.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+
+#ifdef LITE_WITH_FPGA
+TEST(InceptionV3, test) {
+  lite::Predictor predictor;
+
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+
+  predictor.Build("",
+                  FLAGS_model_dir + "/model",
+                  FLAGS_model_dir + "/params",
+                  valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < 2; ++i) {
+    predictor.Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+}
+#endif
+
+}  // namespace lite
+}  // namespace paddle
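The timing loop above records `start` but the added test never prints the measurement. A minimal sketch of the missing report line, assuming `GetCurrentUS()` from `lite/api/test_helper.h` and the two timed iterations used above (hypothetical completion, not part of the patch):

    // Hypothetical completion: report average latency over the two timed runs.
    LOG(INFO) << "Model: " << FLAGS_model_dir << ", avg latency: "
              << (GetCurrentUS() - start) / 2.0 / 1000.0 << " ms over 2 runs";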
diff --git a/lite/api/ocr_attention_test_fpga.cc b/lite/api/ocr_attention_test_fpga.cc
new file mode 100755
index 0000000000000000000000000000000000000000..326de883d1625f7196426094cc4ccec970f8a399
--- /dev/null
+++ b/lite/api/ocr_attention_test_fpga.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <fstream>
+#include <string>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_file, "", "input_file");
+
+namespace paddle {
+namespace lite {
+
+void read_from_file(const std::string& path, float* data, int num) {
+  std::ifstream file_stream;
+  file_stream.open(path);
+  if (!file_stream) {
+    exit(-1);
+    return;
+  }
+
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+}
+
+void chw_to_hwc(float* src, float* dst, int channel, int height, int width) {
+  int amount_per_row = width * channel;
+  int index = 0;
+  for (int c = 0; c < channel; c++) {
+    for (int h = 0; h < height; h++) {
+      int offset_height = h * amount_per_row;
+      for (int w = 0; w < width; w++) {
+        int dst_index = offset_height + w * channel + c;
+        dst[dst_index] = src[index];
+        index = index + 1;
+      }
+    }
+  }
+}
+
+void TestModel(const std::vector<Place>& valid_places,
+               const Place& preferred_place,
+               bool use_npu = false) {
+  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
+  lite::Predictor predictor;
+
+  // predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
+  predictor.Build("", "attention/model", "attention/params", valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 1, 100, 200})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
+
+  read_from_file(FLAGS_input_file, data, 100 * 200);
+  //=============================================
+  auto* init_ids = predictor.GetInput(1);
+  init_ids->Resize(DDim(std::vector<int64_t>({1, 1})));
+  auto* data_ids = init_ids->mutable_data<float>();
+  auto ids_size = init_ids->dims().production();
+  for (int i = 0; i < ids_size; i++) {
+    data_ids[i] = 0;
+  }
+  auto lod_ids = init_ids->mutable_lod();
+  std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
+  *lod_ids = lod_i;
+
+  //=============================================
+  auto* init_scores = predictor.GetInput(2);
+  init_scores->Resize(DDim(std::vector<int64_t>({1, 1})));
+  auto* data_scores = init_scores->mutable_data<float>();
+  auto scores_size = init_scores->dims().production();
+  for (int i = 0; i < scores_size; i++) {
+    data_scores[i] = 0;
+  }
+  auto lod_scores = init_scores->mutable_lod();
+  std::vector<std::vector<uint64_t>> lod_s{{0, 1}, {0, 1}};
+  *lod_scores = lod_s;
+
+  //=============================================
+  auto* position_encoding = predictor.GetInput(3);
+  position_encoding->Resize(DDim(std::vector<int64_t>({1, 33, 10, 23})));
+  auto* position_encoding_data = position_encoding->mutable_data<float>();
+
+  float* temp_data = position_encoding_data;
+
+  for (int i = 0; i < position_encoding->dims().production(); ++i) {
+    temp_data[i] = 0;
+  }
+  int index = 0;
+  for (int i = 0; i < 10; i++) {
+    for (int row = 0; row < 10; row++) {
+      for (int col = 0; col < 23; col++) {
+        if (i == row) {
+          temp_data[index] = 1.0f;
+        } else {
+          temp_data[index] = 0.0f;
+        }
+        index++;
+      }
+    }
+  }
+  for (int i = 0; i < 23; i++) {
+    for (int row = 0; row < 10; row++) {
+      for (int col = 0; col < 23; col++) {
+        if (i == col) {
+          temp_data[index] = 1.0f;
+        } else {
+          temp_data[index] = 0.0f;
+        }
+        index++;
+      }
+    }
+  }
+  // chw_to_hwc(temp_data, position_encoding_data, 33, 10, 23);
+  // delete[] temp_data;
+
+  // read_from_file("position_encoding.data", position_encoding_data, 33 * 10 *
+  // 23);
+  auto start = GetCurrentUS();
+  for (int i = 0; i < 2; ++i) {
+    predictor.Run();
+  }
+
+  std::cout << "================== Speed Report ===================";
+  std::cout << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+
+  auto* out = predictor.GetOutput(0);
+
+  std::string file = "plate_data/" + FLAGS_input_file.substr(9);
+  std::cout << "file:::" << file << std::endl;
+
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < out->dims().production(); i++) {
+    float value = out->data<float>()[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();
+}
+
+TEST(OcrAttention, test_arm) {
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+  TestModel(valid_places, Place{TARGET(kARM), PRECISION(kFloat)});
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
index 9b1189c407d6d601bb3e5ba8172b1455f04710fd..cbc65e41e2912df10fca00169cdc64ea832e7d03 100755
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <chrono>
+#include <string>
+#include <unordered_map>
 #include <iostream>
 #include <vector>
 
@@ -22,7 +25,7 @@
 namespace paddle {
 namespace lite {
 
-#define FPGA_PRINT_TENSOR
+// #define FPGA_PRINT_TENSOR
 
 class Debugger {
  public:
@@ -37,25 +40,47 @@ class Debugger {
     }
   }
 
+  // Records the start timestamp for `key`; tock() replaces it with the
+  // elapsed time in microseconds.
+  void tick(std::string key) { tick_tock_map[key] = current_us(); }
+
+  void tock(std::string key) {
+    if (tick_tock_map.count(key) > 0) {
+      tick_tock_map[key] = current_us() - tick_tock_map[key];
+    }
+  }
+
+  void setEnable(bool en) { enabled_ = en; }
+
  private:
+  bool enabled_ = false;
+  std::unordered_map<std::string, bool> op_config;
+  std::unordered_map<std::string, float> tick_tock_map;
+
+  static float current_us() {
+    return std::chrono::duration_cast<std::chrono::microseconds>(
+               std::chrono::steady_clock::now().time_since_epoch())
+        .count();
+  }
+
   Debugger() {
     op_config["concat"] = true;
     op_config["pooling"] = true;
     op_config["conv"] = true;
+    op_config["dropout"] = true;
     op_config["dwconv"] = true;
     op_config["ew_add"] = true;
+    op_config["ew_mul"] = true;
     op_config["crop"] = true;
     op_config["feed"] = true;
-    op_config["mul"] = true;
     op_config["fetch"] = true;
+    op_config["fc"] = true;
+    op_config["mul"] = true;
     op_config["boxes"] = true;
     op_config["scores"] = true;
     op_config["nms"] = true;
    op_config["pb_boxes"] = true;
     op_config["pb_variances"] = true;
-    // op_config["fc"] = true;
     op_config["softmax"] = true;
+    op_config["split"] = true;
   }
 };
diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp
index eddf5ca454cdc9e91f87d6e4f2c8dfc13f35fdc6..fe66c84143fbc05f0b52a11e2e315b7f3db9054c 100755
--- a/lite/backends/fpga/KD/dl_engine.hpp
+++ b/lite/backends/fpga/KD/dl_engine.hpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdio.h>
+
 #include "lite/backends/fpga/KD/llapi/filter.h"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
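The new tick()/tock() hooks in debugger.hpp accumulate per-op timings in `tick_tock_map`. A minimal usage sketch (hypothetical call site, not part of the patch), assuming a zynqmp PE member named `pe_`:

    // Hypothetical call site: time one kernel dispatch under a named key.
    Debugger::get_instance().setEnable(true);
    Debugger::get_instance().tick("conv");
    pe_.dispatch();                          // any PE::dispatch() call
    Debugger::get_instance().tock("conv");   // tick_tock_map["conv"] now holds elapsed µs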
diff --git a/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl b/lite/backends/fpga/KD/io.cpp
similarity index 63%
rename from lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl
rename to lite/backends/fpga/KD/io.cpp
index 615bf892b321ba67043d41f6032caa758d78c16f..868287d937bf73a6e34af736b4f94a29f34036ca 100644
--- a/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl
+++ b/lite/backends/fpga/KD/io.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cl_common.h>
+#include "io.hpp"
 
-__kernel void sigmoid(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) {
-  const int index = get_global_id(0);
-  if (index < count) {
-    out_data[index] = 1 / (1 + exp(-x_data[index]));
-  }
-}
+namespace paddle {
+namespace zynqmp {
+
+FpgaIO::FpgaIO() {}
+
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/io.hpp b/lite/backends/fpga/KD/io.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..874226bcfed506f10e5e092451b3fe291e3fc6c8
--- /dev/null
+++ b/lite/backends/fpga/KD/io.hpp
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+namespace paddle {
+namespace zynqmp {
+
+class FpgaIO {
+ public:
+  static FpgaIO& get_instance() {
+    static FpgaIO s_instance;
+    return s_instance;
+  }
+
+  void allocData(size_t s) { data_ = new float[s]; }
+
+  float* getData() { return data_; }
+
+ private:
+  float* data_ = nullptr;
+
+  FpgaIO();
+};
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/llapi/bias_scale.cpp b/lite/backends/fpga/KD/llapi/bias_scale.cpp
old mode 100644
new mode 100755
diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp
index da81565cf5ca152a54b6cc1514cb660589428439..e09b9d67d1263278abcd84d6ab9d7e392ee94b48 100755
--- a/lite/backends/fpga/KD/llapi/filter.cpp
+++ b/lite/backends/fpga/KD/llapi/filter.cpp
@@ -240,8 +240,10 @@ int8_t* format_filter(float* data_in,
   for (int n = 0; n < num; n++) {
     float* filter_start = data_in + n * chw;
     int8_t* quantized_start = quantized_data + n * chw;
-    quantize(filter_start, quantized_start, chw, max);
-    filter_max.push_back(1);
+    // float f_max = find_max(filter_start, chw);
+    float f_max = max;
+    quantize(filter_start, quantized_start, chw, f_max);
+    filter_max.push_back(f_max);
   }
 
   int8_t* hwc_data =
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
index bcbf2b98f487aea3c6516fa6369e70d11be97ffc..f8dc1e69627dd039d130a19f224c14eb04e0be92 100644
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -204,7 +204,7 @@ int get_device_info(const struct DeviceInfo &args) {
 int perform_bypass(const struct BypassArgs &args) {
   int ret = -1;
   int size = args.image.channels * args.image.width * args.image.height;
-  int max_size = 1 << 21;
+  int max_size = 1 << 20;
 
   float times = 1.0 * size / max_size;
   int count = static_cast<int>(times);
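With the cap halved to 2^20 elements, perform_bypass processes large images in chunks. A sketch of the chunking arithmetic implied by the surrounding code (variable names from the patch; remainder handling assumed):

    // Sketch: chunked bypass, assuming the remainder is sent as a final partial pass.
    int size = args.image.channels * args.image.width * args.image.height;
    int max_size = 1 << 20;                   // new per-pass element cap
    int count = size / max_size;              // full 2^20-element passes
    int remainder = size - count * max_size;  // left over for one last pass
    // e.g. a 3 x 1080 x 1920 image (6220800 elements) needs 5 full passes + 1 partial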
diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp
index 42ec32957e5884aaae3cc96f46060de114b44ead..222a788d351d9b3dd2cde7c595af898602990ea3 100644
--- a/lite/backends/fpga/KD/pe_params.hpp
+++ b/lite/backends/fpga/KD/pe_params.hpp
@@ -83,26 +83,34 @@ struct ConvParam : PEParam {
   std::vector<int> kernelSize;
   std::vector<int> dilations;
 
-  Tensor* scale() { return scale_; }
+  Tensor* scale() { return &scale_; }
 
-  Tensor* bias() { return bias_; }
+  Tensor* bias() { return &bias_; }
 
   std::vector<BasicConvParam*>& splitParams() { return splitParams_; }
 
+  ~ConvParam() {
+    for (int i = 0; i < splitParams_.size(); i++) {
+      BasicConvParam* basic_param = splitParams_[i];
+      delete basic_param;
+    }
+    splitParams_.clear();
+  }
+
  protected:
   std::vector<BasicConvParam*> splitParams_;
-  Tensor* scale_ = new Tensor();
-  Tensor* bias_ = new Tensor();
+  Tensor scale_;
+  Tensor bias_;
 };
 
 struct DepthwiseConvParam : ConvParam {
  public:
-  Tensor* quantizedFilter() { return quantizedFilter_; }
+  Tensor* quantizedFilter() { return &quantizedFilter_; }
 
   DWconvArgs args;
 
  protected:
-  Tensor* quantizedFilter_ = new Tensor();
+  Tensor quantizedFilter_;
 };
 
 enum PoolingType : int {
@@ -142,7 +150,7 @@ struct ElementwiseAddParam : PEParam {
 
 struct ElementwiseMulParam : PEParam {
  public:
-  Tensor* input_x;
+  Tensor* input_x = nullptr;
   Tensor* input_y = nullptr;
   Tensor* output = nullptr;
 };
@@ -154,13 +162,13 @@ struct FullyConnectedParam : PEParam {
   Tensor* bias = nullptr;
   Tensor* output = nullptr;
 
-  Tensor* quantizedFilter() { return quantizedFilter_; }
+  Tensor* quantizedFilter() { return &quantizedFilter_; }
 
-  Tensor* biasScale() { return biasScale_; }
+  Tensor* biasScale() { return &biasScale_; }
 
  protected:
-  Tensor* quantizedFilter_ = new Tensor();
-  Tensor* biasScale_ = new Tensor();
+  Tensor quantizedFilter_;
+  Tensor biasScale_;
 };
 
 struct SoftmaxParam : PEParam {
@@ -193,10 +201,10 @@ struct NormParam : PEParam {
 };
 
 struct PriorBoxParam : PEParam {
-  Tensor* input;
-  Tensor* image;
-  Tensor* outputBoxes;
-  Tensor* outputVariances;
+  Tensor* input = nullptr;
+  Tensor* image = nullptr;
+  Tensor* outputBoxes = nullptr;
+  Tensor* outputVariances = nullptr;
 
   std::vector<float> minSizes;
   std::vector<float> maxSizes;
@@ -212,10 +220,10 @@ struct PriorBoxParam : PEParam {
 };
 
 struct YoloBoxParam : PEParam {
-  Tensor* input;
-  Tensor* imgSize;
-  Tensor* outputBoxes;
-  Tensor* outputScores;
+  Tensor* input = nullptr;
+  Tensor* imgSize = nullptr;
+  Tensor* outputBoxes = nullptr;
+  Tensor* outputScores = nullptr;
   int downsampleRatio;
   std::vector<int> anchors;
   int classNum;
@@ -229,15 +237,15 @@ struct ScaleParam : PEParam {
   Tensor* scale = nullptr;
   Tensor* bias = nullptr;
 
-  Tensor* alignedScale() { return alignedScale_; }
+  Tensor* alignedScale() { return &alignedScale_; }
 
-  Tensor* alignedBias() { return alignedBias_; }
+  Tensor* alignedBias() { return &alignedBias_; }
 
   ScaleArgs args = {0};
 
  protected:
-  Tensor* alignedScale_ = new Tensor();
-  Tensor* alignedBias_ = new Tensor();
+  Tensor alignedScale_;
+  Tensor alignedBias_;
 };
 
 struct ResizeParam : PEParam {
diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp
index b4eac2c41e138cab19197ccb8ab89681a69ec6fe..4c5da08a4b1f5ae02965b03da8ff95c09a721f5c 100644
--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -212,6 +212,8 @@ class ConvPE : public PE {
 
   ConvParam& param() { return param_; }
 
+  ~ConvPE() {}
+
  private:
   bool use_cpu_ = false;
   bool split_channel = false;
diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
index a2b184e383aa600b1279197a115c58309e204a95..01e16a454af8f14c06b7d62fbefe9b29cfef2850 100644
--- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
+++ b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -38,7 +38,7 @@ class FullyConnectedPE : public PE {
     Tensor* input = param_.input;
     convParam_.input = param_.input;
     convParam_.output = param_.output;
-    // convParam_.relu = param_.relu;
+    convParam_.activeParam.type = param_.activeParam.type;
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp
index 2944691693b135a2d2df7b91ecbe0ef249b015d8..2d02d30fbae12efc372e58c2ad80348356a8f22d 100755
--- a/lite/backends/fpga/KD/pes/output_pe.hpp
+++ b/lite/backends/fpga/KD/pes/output_pe.hpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 #include "lite/backends/fpga/KD/pe.hpp"
 #include "lite/backends/fpga/KD/pe_params.hpp"
 
@@ -52,6 +53,12 @@ class OutputPE : public PE {
     memcpy(DLEngine::get_instance().out_data,
            output->data<float>(),
            output->shape().numel() * sizeof(float));
+
+    fpga_reset();
+
+    auto max = fpga_get_memory_size_max();
+    std::cout << "PL ===== Max: ===== :: " << max << std::endl;
+
     return true;
   }
diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
index 00dfe1830f6f44cbf6a30708fa5783563470c686..d7d58ee8b7e23de843143b643eda0272c4cfc34b 100644
--- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp
+++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
@@ -241,10 +241,13 @@ void PriorBoxPE::compute_prior_box() {
   }
 
   boxes.flush();
-  boxes.syncToCPU();
+  // boxes.syncToCPU();
   variances.flush();
   output_boxes->copyFrom(&boxes);
   output_variances->copyFrom(&variances);
+
+  output_boxes->invalidate();
+  output_variances->invalidate();
 }
 
 void PriorBoxPE::apply() {}
@@ -253,8 +256,9 @@ bool PriorBoxPE::dispatch() {
   if (cachedBoxes_ == nullptr) {
     cachedBoxes_ = new Tensor();
     cachedVariances_ = new Tensor();
-    cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
-    cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
+    cachedBoxes_->mutableData<float16>(FP16, param_.outputBoxes->shape());
+    cachedVariances_->mutableData<float16>(FP16,
+                                           param_.outputVariances->shape());
     cachedBoxes_->setDataLocation(CPU);
     cachedVariances_->setDataLocation(CPU);
     compute_prior_box();
diff --git a/lite/backends/fpga/KD/pes/relu_pe.hpp b/lite/backends/fpga/KD/pes/relu_pe.hpp
index 5c125010c27615c545ba274b259f18c775db3d55..dfc70867735b18f10970864888eca88c7f2dc56e 100755
--- a/lite/backends/fpga/KD/pes/relu_pe.hpp
+++ b/lite/backends/fpga/KD/pes/relu_pe.hpp
@@ -23,43 +23,27 @@ class ReluPE : public PE {
  public:
   bool init() {
     Tensor* output = param_.output;
-    output->setAligned(true);
-    output->setDataLocation(Device);
+    output->setAligned(param_.input->aligned());
+    output->setDataLocation(CPU);
     return true;
   }
 
-  void apply() {
-    Tensor* src = param_.input;
-
-    args_.input_data_type = DATA_TYPE_FP16;
-    args_.output_data_type = DATA_TYPE_FP16;
-    args_.input_layout_type = LAYOUT_HWC;
-    args_.output_layout_type = LAYOUT_HWC;
-    args_.image = {.address = src->data<void>(),
-                   .scale_address = src->scale(),
-                   .channels = (uint32_t)src->shape().channel(),
-                   .width = (uint32_t)src->shape().width(),
-                   .height = (uint32_t)src->shape().height(),
-                   .pad_width = 0u,
-                   .pad_height = 0u};
-    args_.output = {
-        .address = param_.output->data<void>(),
-        .scale_address = param_.output->scale(),
-    };
-
-    inplace_.relu_enable = false;
-    inplace_.power_enable = false;
-    inplace_.normalize_enable = false;
-  }
+  void apply() {}
 
   bool dispatch() {
-    inplace_.relu_enable = true;
-    config_inplace(inplace_);
-    param_.input->syncToDevice();
-    param_.output->copyFrom(param_.input);
-    param_.output->invalidate();
-    inplace_.relu_enable = false;
-    config_inplace(inplace_);
+    param_.input->invalidate();
+    int16_t* input_data = param_.input->data<int16_t>();
+    float16* out_data = param_.output->data<float16>();
+    for (int i = 0; i < param_.input->shape().alignedElementCount(); i++) {
+      int16_t v = param_.input->data<int16_t>()[i];
+      if (v > 0) {
+        out_data[i] = input_data[i];
+      } else {
+        out_data[i] = zero;
+      }
+    }
+    param_.output->copyScaleFrom(param_.input);
+    param_.output->flush();
     return true;
   }
 
@@ -67,8 +51,7 @@ class ReluPE : public PE {
  private:
   InputParam param_;
-  BypassArgs args_;
-  InplaceArgs inplace_;
+  float16 zero = float_to_half(0.0f);
 };
 
 }  // namespace zynqmp
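The rewritten dispatch() runs ReLU on the CPU by comparing raw FP16 bit patterns as int16_t: IEEE half-precision keeps the sign in the top bit, so any nonzero negative half reads as a negative int16_t, and `v > 0` matches `half_to_float(v) > 0.0f` for normal values (negative zero and NaN are the edge cases). A minimal sketch of that equivalence, assuming `float_to_half`/`half_to_float` from lite/backends/fpga/KD/float16.hpp:

    // Sketch: the sign-bit trick used by the new ReluPE::dispatch().
    int16_t neg = static_cast<int16_t>(float_to_half(-1.5f));  // sign bit set  -> negative as int16_t
    int16_t pos = static_cast<int16_t>(float_to_half(2.0f));   // sign bit clear -> positive as int16_t
    // (neg > 0) == false and (pos > 0) == true, matching a float-domain ReLU test.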
diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp
index 09755c65a322da8ccab0d57dd2e877712b112361..5ff94edd747fe9f01741baf1efaad288ed32b98d 100755
--- a/lite/backends/fpga/KD/pes/scale_pe.hpp
+++ b/lite/backends/fpga/KD/pes/scale_pe.hpp
@@ -36,6 +36,7 @@ class ScalePE : public PE {
   }
 
   inline int lcm(int a, int b) { return a * b / gcd(a, b); }
+
   bool init() {
     Tensor* output = param_.output;
     output->setAligned(true);
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
index 988bc1bb507036de8f13a6c6549c549718bd1256..19f8f3b2500002a1329f433de0474f512931e6e8 100644
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -103,12 +103,14 @@ class Tensor {
     return reinterpret_cast<Dtype*>(ptr);
   }
 
+  void releaseData() {
+    released = true;
+    placeHolder_.reset();
+  }
+
   template <typename Dtype>
   Dtype* mutableData(DataType dataType, const Shape& shape) {
-    if (this->shape_ != nullptr) {
-      delete shape_;
-    }
-    this->shape_ = new Shape(shape);
+    this->shape_.reset(new Shape(shape));
     this->dataType_ = dataType;
     return mutableData<Dtype>();
   }
@@ -138,7 +140,7 @@ class Tensor {
 
   DataType dataType() { return this->dataType_; }
 
-  Shape& shape() { return *shape_; }
+  Shape& shape() { return *(shape_.get()); }
 
   bool aligned() { return this->aligned_; }
@@ -247,15 +249,12 @@ class Tensor {
   void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
 
   void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
-    if (shape_ != nullptr) {
-      delete shape_;
-    }
     this->placeHolder_ = src->placeHolder_;
     this->dataType_ = src->dataType_;
     this->aligned_ = src->aligned_;
     this->dateLocation_ = src->dateLocation_;
     this->offset = offset;
-    shape_ = new Shape(const_cast<Shape&>(shape));
+    shape_.reset(new Shape(shape));
   }
 
   void copyFrom(Tensor* src) {
@@ -284,7 +283,6 @@ class Tensor {
         .address = data<void>(),
         .scale_address = scale(),
     };
     args.output = output;
-    src->syncToDevice();
     size_t aligned_remainder = src->shape().numel() % 16;
     if (aligned_remainder > 0) {
       size_t dtype_size =
@@ -294,12 +292,14 @@ class Tensor {
       fpga_flush(dst, aligned_remainder * dtype_size);
     }
     src->syncToDevice();
-    this->invalidate();
     perform_bypass(args);
     this->invalidate();
   }
 
   void flush() {
+    if (released) {
+      return;
+    }
     size_t memorySize = placeHolder_->memorySize();
     fpga_flush(placeHolder_->data(), memorySize);
   }
@@ -380,7 +380,6 @@ class Tensor {
   }
 
   void save_file_with_name(std::string path) {
-    invalidate();
     std::ofstream ofs;
     ofs.open(path);
     ofs << scale()[0] << " / " << scale()[1] << std::endl;
@@ -389,11 +388,17 @@ class Tensor {
       float value = 0;
       if (dataType_ == FP32) {
         value = data<float>()[i];
-      } else if (dataType_ == FP16) {
+      }
+      if (dataType_ == FP16) {
         value = half_to_float(data<float16>()[i]);
-      } else {
+      }
+
+      if (dataType_ == INT8) {
         value = data<int8_t>()[i];
       }
+      if (dataType_ == INT32) {
+        value = data<int32_t>()[i];
+      }
       ofs << value << std::endl;
     }
     ofs.close();
@@ -451,18 +456,12 @@ class Tensor {
     return os;
   }
 
-  ~Tensor() {
-    if (shape_ != nullptr) {
-      delete shape_;
-      shape_ = nullptr;
-    }
-  }
-
  private:
+  bool released = false;
   int offset = 0;
   float mem_scale_factor_ = 1.0f;
   std::shared_ptr<PlaceHolder> placeHolder_;
-  Shape* shape_ = nullptr;
+  std::shared_ptr<Shape> shape_;
   DataType dataType_ = FP32;
   bool aligned_ = false;
   DataSyncStatus synchedStatus_ = Synched;
diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc
index 7f1e8d3e17f97315e77532b77bbcfcc8331edd4f..53086404955b2b477532f2e73250005807183c87 100755
--- a/lite/backends/fpga/lite_tensor.cc
+++ b/lite/backends/fpga/lite_tensor.cc
@@ -69,7 +69,7 @@ std::string DDimLite::repr() const {
 }
 
 void TensorLite::ShareDataWith(const TensorLite &other) {
-  buffer_ = other.buffer_;
+  buffer_ = other.buffer_;  // TODO(chonwhite) delete buffer;
   dims_ = other.dims_;
   zynq_tensor_ = other.zynq_tensor_;
   target_ = other.target_;
@@ -79,10 +79,8 @@ void TensorLite::ShareDataWith(const TensorLite &other) {
 }
 
 void *TensorLite::mutable_data(size_t memory_size) {
-  memory_size_ = memory_size;
+  memory_size_ = memory_size;  // TODO(chonwhite) delete buffer;
   buffer_->ResetLazy(target_, memory_size_);
-  // throw -1;
-  std::cout << memory_size << std::endl;
   return buffer_->data();
 }
 
@@ -95,13 +93,20 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
   dims_ = other.dims_;
   target_ = other.target_;
   lod_ = other.lod_;
-  auto dt = zynq_tensor_->dataType();
-  auto shape = other.zynq_tensor_->shape();
+  if (zynq_tensor_.get() == nullptr) {
+    zynq_tensor_.reset(new zynqmp::Tensor());
+  }
+  auto dt = zynq_tensor_->dataType();
 
   Resize(other.dims());
+  auto shape = other.zynq_tensor_->shape();
   zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
-  this->ZynqTensor()->copyFrom(other.ZynqTensor());
+
+  // this->ZynqTensor()->copyFrom(other.ZynqTensor());
+  memcpy(this->ZynqTensor()->data<void>(),
+         other.ZynqTensor()->data<void>(),
+         other.ZynqTensor()->shape().numel() * sizeof(float));
 }
 
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h
index 266e0b5ce0ea03108978c3b0a32fbf0e3872c83c..3574d466e9ad88b1cd0de55751f8c519b967045a 100644
--- a/lite/backends/fpga/lite_tensor.h
+++ b/lite/backends/fpga/lite_tensor.h
@@ -81,6 +81,8 @@ class DDimLite {
     return !(a == b);
   }
 
+  ~DDimLite() {}
+
  private:
   std::vector<value_type> data_;
 };
@@ -142,7 +144,9 @@ class TensorLite {
   void *mutable_data(size_t memory_size);
   void *mutable_data(TargetType target, size_t memory_size);
 
-  const void *raw_data() const { return buffer_->data(); }
+  const void *raw_data() const {
+    return buffer_->data();
+  }  // TODO(chonwhite) delete buffer;
 
   size_t data_size() const { return this->dims().production(); }
 
@@ -150,17 +154,19 @@ class TensorLite {
 
   size_t offset() const { return offset_; }
 
-  bool IsInitialized() const { return buffer_->data(); }
-  void clear() {
-    buffer_->Free();
-    offset_ = 0;
-  }
+  bool IsInitialized() const {
+    return buffer_->data();
+  }  // TODO(chonwhite) delete buffer;
 
   // Other share data to this.
   void ShareDataWith(const TensorLite &other);
 
   void CopyDataFrom(const TensorLite &other);
 
+  void clear() {
+    // zynq_tensor_->releaseData();
+  }
+
   template <typename T>
   TensorLite Slice(int64_t begin, int64_t end) const;
 
@@ -169,7 +175,10 @@ class TensorLite {
 
   TargetType target() const { return target_; }
 
-  zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
+  // template <typename T>
+  // TensorLite Slice(int64_t begin, int64_t end) const;
+
+  zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_.get(); }
 
   friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
     os << "Tensor:" << '\n';
@@ -198,12 +207,34 @@ class TensorLite {
   size_t memory_size_{};
   size_t offset_{0};
 
-  zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
+  std::shared_ptr<zynqmp::Tensor> zynq_tensor_;
 
   template <typename T>
   void mutable_data_internal();
 };
 
+template <typename T>
+zynqmp::DataType get_date_type() {
+  zynqmp::DataType data_type = zynqmp::FP32;
+  if (typeid(T) == typeid(float)) {
+    data_type = zynqmp::FP32;
+  }
+  if (typeid(T) == typeid(zynqmp::float16)) {
+    data_type = zynqmp::FP16;
+  }
+  if (typeid(T) == typeid(int)) {
+    data_type = zynqmp::INT32;
+  }
+  if (typeid(T) == typeid(int32_t)) {
+    data_type = zynqmp::INT32;
+  }
+  if (typeid(T) == typeid(int8_t)) {
+    data_type = zynqmp::INT8;
+  }
+
+  return data_type;
+}
+
 template <typename T, typename R>
 R *TensorLite::mutable_data() {
   std::vector<int64_t> v;
@@ -229,14 +260,12 @@ R *TensorLite::mutable_data() {
       break;
   }
   zynqmp::Shape input_shape(layout_type, v);
+  zynqmp::DataType data_type = get_date_type<T>();
 
-  zynqmp::DataType data_type = zynqmp::FP32;
-  if (typeid(T) == typeid(float)) {
-    data_type = zynqmp::FP32;
-  }
-  if (typeid(T) == typeid(zynqmp::float16)) {
-    data_type = zynqmp::FP16;
+  if (zynq_tensor_.get() == nullptr) {
+    zynq_tensor_.reset(new zynqmp::Tensor());
   }
+
   return zynq_tensor_->mutableData<T>(data_type, input_shape);
 }
 
@@ -276,6 +305,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
 
 template <typename T>
 void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
+  // TODO(chonwhite) delete this function;
   CHECK_GE(begin, 0);
   CHECK_LE(end, dims_[0]);
   CHECK_LT(begin, end);
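The new `get_date_type<T>()` helper (name as spelled in the patch) centralizes the C++-type-to-zynqmp-precision mapping that `mutable_data<T>()` now uses when it lazily creates the backing `zynqmp::Tensor`. A minimal usage sketch, assuming a `TensorLite` named `tensor` (hypothetical call site):

    // Sketch: what mutable_data<T>() now resolves to.
    //   float           -> zynqmp::FP32      int / int32_t -> zynqmp::INT32
    //   zynqmp::float16 -> zynqmp::FP16      int8_t        -> zynqmp::INT8
    auto* fp16_ptr = tensor.mutable_data<zynqmp::float16>();  // FP16 backing store
    auto* int8_ptr = tensor.mutable_data<int8_t>();           // INT8 backing store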
diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
index b688bbc1083a6ab0f521381c4a988a12badc3141..a3b90f7d1040b4d878db784c44d578dc37581d42 100644
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
@@ -25,7 +25,7 @@ namespace mir {
 void ConvActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   std::vector<std::string> act_types{"relu"};
   for (auto& place : graph->valid_places()) {
-    if (place.target == TARGET(kCUDA)) {
+    if (place.target == TARGET(kCUDA) || place.target == TARGET(kFPGA)) {
       act_types.push_back("leaky_relu");
       break;
     }
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
index 0327a50af3387588cf067c637762b625cadfc58f..754bfe142e59d066b936c9337d59c56fbf55eba5 100644
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
@@ -103,6 +103,12 @@ void DeleteDynamicQuantOpFuser::InsertNewNode(SSAGraph* graph,
   // obtain values, save values and relink node
   int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
+  int range = ((1 << (bit_length - 1)) - 1);
+  auto* scope = quant_node->stmt()->op()->scope();
+  auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name)
+                           ->GetMutable<lite::Tensor>();
+  float scale_value = scale_tensor->data<float>()[0] / range;
+
   auto outlinks = output_act_node->outlinks;
   for (auto* quantized_node : outlinks) {
     auto* op_desc = quantized_node->stmt()->mutable_op_info();
@@ -208,9 +214,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
   for (int i = 0; i < weight_scale_size; i++) {
     weight_scale.push_back(whole_weight_scale);
   }
+
 #ifndef LITE_WITH_FPGA
   op_desc.SetAttr("enable_int8", true);
 #endif
+
   if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
     op_desc.SetAttr("input_scale", input_scale);
   }
@@ -689,13 +697,16 @@ void DynamicQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
   float* temp_data = temp_tensor.mutable_data<float>();
   size_t weight_num = quantized_weight_t->data_size();
   quantized_weight_t->set_persistable(true);
+
   std::cout << "DynamicQuantDequantOpFuser::InsertNewNode===================="
                "========================================"
             << std::endl;
+
 #ifdef LITE_WITH_FPGA
   float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
   for (size_t i = 0; i < weight_num; i++) {
     quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
+
     std::cout << whole_weight_scale << "," << temp_data[i] << ","
               << quantized_weight_data[i] << std::endl;
   }
diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h
index 35cd2e6ef7e4a82335765e89bb6f80df07e4f903..71c6ea92737986f1ecb860c31fb21150697f00e9 100644
--- a/lite/core/mir/kernel_place_correct_pass.h
+++ b/lite/core/mir/kernel_place_correct_pass.h
@@ -86,6 +86,8 @@ class KernelPlaceCorrectPass : public DebugPass {
             << node_name;
         VLOG(4) << "-- input arg_name:" << arg_name << " "
                 << "-- node name:" << node_name;
+
+        auto type = inst.picked_kernel().GetInputDeclType(arg_name);
         if (!x_in->AsArg().type) {
           need_correct_place &= false;
         } else {
@@ -107,6 +109,8 @@ class KernelPlaceCorrectPass : public DebugPass {
               << node_name << " in Inst " << inst.op_type();
 
         VLOG(4) << "-- output arg_name " << arg_name;
+
+        auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
         if (!x_out->AsArg().type) {
           need_correct_place &= false;
         } else {
diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc
old mode 100644
new mode 100755
diff --git a/lite/core/program.cc b/lite/core/program.cc
index 4f6ea2ce470724c0b00993478c47eb0315b5a1e5..ce6bd3a36cd1d852f2d50f69c4be9e31b84b3f60 100755
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -139,7 +139,14 @@ void RuntimeProgram::Run() {
   for (auto& inst : instructions_) {
 #ifndef LITE_WITH_FPGA
     if (inst.is_feed_fetch_op()) continue;
+#endif
     std::string op_type = inst.op()->op_info()->Type();
+
+    VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr()
+            << " on Target " << TargetToStr(inst.kernel()->target());
+
+#ifndef LITE_WITH_FPGA
+    if (op_type == "feed" || op_type == "fetch") continue;
 #endif
     inst.Run();
 #ifdef LITE_WITH_PROFILE
diff --git a/lite/gen_code/paddle_infer.h b/lite/gen_code/paddle_infer.h
index e01ffc25e29ca94166e8fe12b0643ae9e914001d..2449e1e5d3fb721a39760e78a0417bf9491d8cef 100644
--- a/lite/gen_code/paddle_infer.h
+++ b/lite/gen_code/paddle_infer.h
@@ -46,7 +46,7 @@ class Tensor {
  */
 class PaddlePredictor {
  public:
-  void Init();
+  void Init() {}
 
   std::unique_ptr<Tensor> GetTensor(const std::string &id) const;
   std::unique_ptr<Tensor> GetMutableTensor(const std::string &id);
diff --git a/lite/kernels/arm/sequence_pool_compute.cc b/lite/kernels/arm/sequence_pool_compute.cc
index 8fcbb8cffe72935e4df503c3c1748ddb68247fb7..93072fe499eed296d6e31d87ee9b74494de07aa1 100644
--- a/lite/kernels/arm/sequence_pool_compute.cc
+++ b/lite/kernels/arm/sequence_pool_compute.cc
@@ -59,6 +59,7 @@ void SequencePoolCompute::Run() {
     for (int i = 0; i <= batch_size; i++) {
       offset_new[i] = i;
     }
+    (output->mutable_lod())->clear();
     (output->mutable_lod())->push_back(offset_new);
   }
diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt
index f6c3a399490a86e2ac2fcd9cbeb76fca8c8ac479..7251d875276b7ce7492c14742aea332a4f5e22d2 100755
--- a/lite/kernels/fpga/CMakeLists.txt
+++ b/lite/kernels/fpga/CMakeLists.txt
@@ -5,28 +5,32 @@ endif()
 
 set(fpga_deps fpga_target_wrapper kernel_fpga)
 
-# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
+add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
 # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
-# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
+
+add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
+
 add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
 # add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
 add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
 add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
-# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
 add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
 add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
+
 # add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
 add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
 add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
+
 # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
 add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
 add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
-# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
+add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
 # add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
 add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
-# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
-# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
+add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
+add_kernel(split_compute_fpga FPGA basic SRCS split_compute.cc DEPS ${fpga_deps})
+add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
 add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps})
 add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps})
diff --git a/lite/kernels/fpga/activation_compute.cc b/lite/kernels/fpga/activation_compute.cc
index ecd9af0f8da5df62a15637e88dc4564efb187f6c..f6704204d34c309835c1de0ef61afed97c0b29e3 100644
--- a/lite/kernels/fpga/activation_compute.cc
+++ b/lite/kernels/fpga/activation_compute.cc
@@ -25,10 +25,10 @@ using float16 = zynqmp::float16;
 
 void ReluCompute::PrepareForRun() {
   auto& param = this->Param<param_t>();
   auto output_data = param.Out->mutable_data<float16>();
-  zynqmp::InputParam& input_param = pe_.param();
+  zynqmp::InputParam& relu_param = pe_.param();
 
-  input_param.input = param.X->ZynqTensor();
-  input_param.output = param.Out->ZynqTensor();
+  relu_param.input = param.X->ZynqTensor();
+  relu_param.output = param.Out->ZynqTensor();
   pe_.init();
   pe_.apply();
 }
diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc
index 69e600a043389fcc36bda2906f38432f2771aaf8..bd6adf60934eda84a0f95c1d3bca6f801b566484 100644
--- a/lite/kernels/fpga/conv_compute.cc
+++ b/lite/kernels/fpga/conv_compute.cc
@@ -72,12 +72,6 @@ void ConvCompute::PrepareForRun() {
     conv_param.activeParam.type = zynqmp::TYPE_RELU;
   }
 
-  // conv_param.filter->saveToFile("conv_filter_", true);
-  // if (param.bias != nullptr) {
-  //   std::cout << "param.bias != nullptr" << std::endl;
-  //   conv_param.bias()->saveToFile("conv_bias_", true);
-  // }
-
   conv_pe_.init();
   conv_pe_.apply();
 }
diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc
index d22cc7abacc2ecd80e54aa5c62a7e57671b920c9..39780d82276188b141e31d89466fbe09434393aa 100755
--- a/lite/kernels/fpga/elementwise_compute.cc
+++ b/lite/kernels/fpga/elementwise_compute.cc
@@ -80,21 +80,21 @@ void ElementwiseMulCompute::PrepareForRun() {
   scale_param.activeParam.type = zynqmp::TYPE_NONE;
 
   int channel = scale_param.input->shape().channel();
-  zynqmp::Tensor* scale = new zynqmp::Tensor();
-  zynqmp::Tensor* bias = new zynqmp::Tensor();
-  scale_param.scale = scale;
-  scale_param.bias = bias;
+  scale_param.scale = &scale_;
+  scale_param.bias = &bias_;
   zynqmp::Shape shape(zynqmp::N, {channel});
-  float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
-  float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
+  zynqmp::float16* scale_data =
+      scale_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
+  zynqmp::float16* bias_data =
+      bias_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
 
   float scale_value = param.Y->data<float>()[0];
-  for (int i = 0; i < channel; ++i) {
+  for (int i = 0; i < channel; i++) {
     if (param.Y->dims().production() != 1) {
       scale_value = param.Y->ZynqTensor()->data<float>()[i];
     }
-    scale_data[i] = scale_value;
-    bias_data[i] = 0;
+    scale_data[i] = zynqmp::float_to_half(scale_value);
+    bias_data[i] = zero_;
   }
 
   pe_.init();
@@ -102,6 +102,10 @@ void ElementwiseMulCompute::PrepareForRun() {
 }
 
 void ElementwiseMulCompute::Run() {
+  auto& param = Param<operators::ElementwiseParam>();
+  param.Y->ZynqTensor()->flush();
+  scale_.copyFrom(param.Y->ZynqTensor());
+  scale_.invalidate();
   pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::ScaleParam& scale_param = pe_.param();
diff --git a/lite/kernels/fpga/elementwise_compute.h b/lite/kernels/fpga/elementwise_compute.h
index e3e9c52c4c660e9ae6852f2ec8cdd815829ad524..9fa4991161dff6bba6c860838863b1cb38393877 100644
--- a/lite/kernels/fpga/elementwise_compute.h
+++ b/lite/kernels/fpga/elementwise_compute.h
@@ -61,6 +61,9 @@ class ElementwiseMulCompute
 
  private:
   zynqmp::ScalePE pe_;
+  zynqmp::Tensor scale_;
+  zynqmp::Tensor bias_;
+  zynqmp::float16 zero_ = zynqmp::float_to_half(0.0f);
 };
 
 }  // namespace fpga
diff --git a/lite/kernels/fpga/elementwise_compute_test.cc b/lite/kernels/fpga/elementwise_compute_test.cc
index 51c9d54cad9054e4767860096d42e9c991d4f936..de7d23f09b89ac9308af773d91ed8f444a45a6ad 100644
--- a/lite/kernels/fpga/elementwise_compute_test.cc
+++ b/lite/kernels/fpga/elementwise_compute_test.cc
@@ -93,21 +93,6 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
   }
 
   // do elementwise add/sub/max...
   if (elt_type == "add") {
-    // for (int i = 0; i < batch; ++i) {
-    //   for (int j = 0; j < channels; ++j) {
-    //     int offset = (i * channels + j) * num;
-    //     const dtype* din_ptr = x_data + offset;
-    //     const dtype diny_data = y_data[j];
-    //     dtype* dout_ptr = out_data + offset;
-    //     for (int k = 0; k < num; ++k) {
-    //       *dout_ptr =
-    //           zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr),
-    //                                     zynqmp::half_to_float(diny_data)));
-    //       dout_ptr++;
-    //       din_ptr++;
-    //     }
-    //   }
-    // }
     int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
     for (int i = 0; i < count; ++i) {
       out_data[i] = zynqmp::float_to_half(sum(
@@ -229,75 +214,6 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
   ASSERT_TRUE(fusion_elementwise_add_activation.front());
 }
 
-// TEST(fusion_elementwise_add_activation_fpga, init) {
-//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-//   ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
-//   ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
-// }
-
-// TEST(fusion_elementwise_add_activation_fpga, compute) {
-//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-//   operators::FusionElementwiseActivationParam param;
-//   lite::Tensor x, y, output, output_ref;
-
-//   for (auto act_type : {"relu"}) {
-//     for (auto n : {1}) {
-//       for (auto c : {8}) {
-//         for (auto h : {8}) {
-//           for (auto w : {8}) {
-//             for (auto axis : {0}) {
-//               for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
-//                 auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
-//                 auto y_dim = DDim(yd);
-//                 int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
-
-//                 if (axis_t + y_dim.size() > 4) continue;
-//                 bool flag = false;
-//                 for (int i = 0; i < y_dim.size(); i++) {
-//                   if (x_dim[i + axis_t] != y_dim[i]) flag = true;
-//                 }
-//                 if (flag) continue;
-
-//                 x.Resize(x_dim);
-//                 y.Resize(y_dim);
-//                 output.Resize(x_dim);
-//                 output_ref.Resize(x_dim);
-//                 auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
-//                 auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
-//                 auto* output_data =
-//                     output.mutable_data<float16>(TARGET(kFPGA));
-//                 auto* output_ref_data =
-//                     output_ref.mutable_data<float16>(TARGET(kFPGA));
-//                 for (int i = 0; i < x_dim.production(); i++) {
-//                   float sign = i % 3 == 0 ? -1.0f : 1.0f;
-//                   x_data[i] = zynqmp::float_to_half(i * sign);
-//                 }
-//                 for (int i = 0; i < y_dim.production(); i++) {
-//                   float sign = i % 2 == 0 ? 0.5f : -0.5f;
-//                   y_data[i] = zynqmp::float_to_half(i * sign);
-//                 }
-//                 param.X = &x;
-//                 param.Y = &y;
-//                 param.axis = axis;
-//                 param.Out = &output;
-//                 param.act_type = act_type;
-//                 fusion_elementwise_add_activation.SetParam(param);
-//                 fusion_elementwise_add_activation.PrepareForRun();
-//                 fusion_elementwise_add_activation.Run();
-//                 param.Out = &output_ref;
-//                 elementwise_compute_ref<float16>(param, "add", act_type);
-//                 for (int i = 0; i < output.dims().production(); i++) {
-//                   EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-//                 }
-//               }
-//             }
-//           }
-//         }
-//       }
-//     }
-//   }
-// }
-
 }  // namespace fpga
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc
index 79329e99a3e5e812dca487c17452f3f5d1e96449..9ca2424bc2f8a748c348cac4aafd219e538c7a17 100755
--- a/lite/kernels/fpga/feed_compute.cc
+++ b/lite/kernels/fpga/feed_compute.cc
@@ -40,8 +40,8 @@ void FeedCompute::PrepareForRun() {
 void FeedCompute::Run() {
   auto& param = this->Param<param_t>();
   Tensor& x = param.feed_list->at(param.col);
+  pe_.param().input = x.ZynqTensor();
   pe_.dispatch();
-
   auto out_lod = param.out->mutable_lod();
   *out_lod = x.lod();
diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc
index 2d296f4d4a89b1fd86e5b2330d3caf44fbad0903..d5c8585aaefd6dd54e3f3f603173b1790023ede8 100755
--- a/lite/kernels/fpga/fetch_compute.cc
+++ b/lite/kernels/fpga/fetch_compute.cc
@@ -55,6 +55,7 @@ void FetchCompute::Run() {
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::OutputParam& fetch_param = pe_.param();
   Debugger::get_instance().registerOutput("fetch", fetch_param.output);
+  Debugger::get_instance().setEnable(true);
 #endif
 }
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
index 4554c24e07de656b948826c2fa6f9526f61daaa6..8b515532453d41eb504fabb228e491f0d5a3c00e 100755
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -45,21 +45,32 @@ class IoCopyHostToFpgaCompute
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kHost) ||
           param.x->target() == TARGET(kFPGA));
-    param.y->mutable_data<zynqmp::float16>();
-    if (param.x->ZynqTensor()->aligned() &&
-        param.x->ZynqTensor()->shape().shouldAlign()) {
-      zynqmp::Tensor tempTensor;
-      tempTensor.mutableData<zynqmp::float16>(zynqmp::FP16,
-                                              param.x->ZynqTensor()->shape());
-      tempTensor.copyFrom(param.x->ZynqTensor());
-      tempTensor.setAligned(true);
-      tempTensor.unalignImage();
-      param.y->ZynqTensor()->copyFrom(&tempTensor);
-    } else {
+    param.x->ZynqTensor()->flush();
+
+    if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
+      param.y->mutable_data<int>();
       param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+      return;
     }
-    param.y->ZynqTensor()->invalidate();
-    param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+
+    if (param.x->ZynqTensor()->dataType() == zynqmp::FP32) {
+      param.y->mutable_data<zynqmp::float16>();
+      if (param.x->ZynqTensor()->aligned() &&
+          param.x->ZynqTensor()->shape().shouldAlign()) {
+        zynqmp::Tensor tempTensor;
+        tempTensor.mutableData<zynqmp::float16>(zynqmp::FP16,
+                                                param.x->ZynqTensor()->shape());
+        tempTensor.copyFrom(param.x->ZynqTensor());
+        tempTensor.setAligned(true);
+        tempTensor.unalignImage();
+        param.y->ZynqTensor()->copyFrom(&tempTensor);
+      } else {
+        param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+      }
+      param.y->ZynqTensor()->invalidate();
+      param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+    }
+
     auto out_lod = param.y->mutable_lod();
     *out_lod = param.x->lod();
   }
diff --git a/lite/kernels/fpga/mul_compute.cc b/lite/kernels/fpga/mul_compute.cc
index c27600d9f773ff0aae04a2ee519905bc0e58785c..659c8dfb653c1707105a7337493ee4f3b3357b76 100755
--- a/lite/kernels/fpga/mul_compute.cc
+++ b/lite/kernels/fpga/mul_compute.cc
@@ -80,7 +80,8 @@ void mul(MulCompute* k) {
 }
 
 void MulCompute::Run() {
-  pe_.dispatch();
+  // pe_.dispatch();
+  mul(this);
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::FullyConnectedParam& fc_param = pe_.param();
   Debugger::get_instance().registerOutput("mul", fc_param.output);
diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc
index 4834054df6371a9faaa17bd17b53a29b999ddf03..23a5aad8e694d33cc30adec114e520620685178e 100644
--- a/lite/kernels/fpga/multiclass_nms_compute.cc
+++ b/lite/kernels/fpga/multiclass_nms_compute.cc
@@ -318,14 +318,29 @@ void MultiClassOutput(const Tensor& scores,
 
 void MulticlassNmsCompute::Run() {
   auto& param = Param<operators::MulticlassNmsParam>();
-  auto* boxes = param.bboxes;
-  auto* scores = param.scores;
+  auto* boxes_in = param.bboxes;
+  auto* scores_in = param.scores;
   auto* outs = param.out;
   outs->mutable_data<float>();
 
-  auto score_dims = scores->dims();
+  auto score_dims = boxes_in->dims();
   auto score_size = score_dims.size();
 
+  Tensor boxes_float;
+  Tensor scores_float;
+
+  boxes_float.Resize(boxes_in->dims());
+  scores_float.Resize(scores_in->dims());
+
+  boxes_float.mutable_data<float>();
+  scores_float.mutable_data<float>();
+
+  boxes_float.ZynqTensor()->copyFrom(boxes_in->ZynqTensor());
+  scores_float.ZynqTensor()->copyFrom(scores_in->ZynqTensor());
+
+  Tensor* boxes = &boxes_float;
+  Tensor* scores = &scores_float;
+
   auto box_dims = boxes->dims();
   int64_t box_dim = boxes->dims()[2];
 
@@ -383,6 +398,7 @@ void MulticlassNmsCompute::Run() {
       MultiClassOutput(
           scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
       outs->ZynqTensor()->copyFrom(out.ZynqTensor());
+      out.ZynqTensor()->saveToFile("nms_oo", true);
     }
     outs->Resize({static_cast<int64_t>(e - s), out_dim});
   }
@@ -402,16 +418,16 @@ void MulticlassNmsCompute::Run() {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_KERNEL(multiclass_nms,
-                     kFPGA,
-                     kFP16,
-                     kNHWC,
-                     paddle::lite::kernels::fpga::MulticlassNmsCompute,
-                     def)
-    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-    .Finalize();
+// REGISTER_LITE_KERNEL(multiclass_nms,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::fpga::MulticlassNmsCompute,
+//                      def)
+//     .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
+//     .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+//     .Finalize();
 
 REGISTER_LITE_KERNEL(multiclass_nms,
                      kFPGA,
@@ -427,5 +443,8 @@ REGISTER_LITE_KERNEL(multiclass_nms,
                 {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kFPGA),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNHWC))})
    .Finalize();
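The NMS kernel now stages its FP16 `BBoxes`/`Scores` inputs into float-backed temporaries before the CPU-side NMS loops, relying on `zynqmp::Tensor::copyFrom` to convert precision during the copy. A minimal sketch of the same staging pattern for any FP16 FPGA tensor (hypothetical `src` TensorLite; a sketch of assumed copyFrom semantics, not part of the patch):

    // Hypothetical staging of an FP16 tensor into plain floats for CPU code.
    Tensor staged;
    staged.Resize(src->dims());
    staged.mutable_data<float>();                      // FP32 backing store
    staged.ZynqTensor()->copyFrom(src->ZynqTensor());  // bypass converts FP16 -> FP32
    const float* host_view = staged.data<float>();     // safe for CPU-side loops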
diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc
index afd14ccb4b4a9a4f1e93e1e38840035fb18186bb..a11e67d837b81b03a8cca753bc409509ca5833b6 100644
--- a/lite/kernels/fpga/prior_box_compute.cc
+++ b/lite/kernels/fpga/prior_box_compute.cc
@@ -131,3 +131,27 @@ REGISTER_LITE_KERNEL(prior_box,
     .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+
+// REGISTER_LITE_KERNEL(prior_box,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::fpga::PriorBoxCompute,
+//                      def)
+//     .BindInput("Input",
+//                {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                       PRECISION(kFP16),
+//                                       DATALAYOUT(kNHWC))})
+//     .BindInput("Image",
+//                {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                       PRECISION(kFP16),
+//                                       DATALAYOUT(kNHWC))})
+//     .BindOutput("Boxes",
+//                 {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                        PRECISION(kFP16),
+//                                        DATALAYOUT(kNHWC))})
+//     .BindOutput("Variances",
+//                 {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                        PRECISION(kFP16),
+//                                        DATALAYOUT(kNHWC))})
+//     .Finalize();
diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc
index f72f18892c987e48fb3467372352f6ded98444ff..b79051f5b168abd857b5ec09cb048984da3dd4e1 100644
--- a/lite/kernels/fpga/reshape_compute.cc
+++ b/lite/kernels/fpga/reshape_compute.cc
@@ -38,18 +38,15 @@ void ReshapeCompute::Run() {
     auto* actual_shape_data = actual_shape->data<int>();
     auto shape = std::vector<int>(
         actual_shape_data, actual_shape_data + actual_shape_dims.production());
-    output_dims = lite::operators::ValidateShape(shape, x_dims);
+    // output_dims = lite::operators::ValidateShape(shape, x_dims);  // TODO
     output->Resize(output_dims);
   }
-  if (inplace) {
-    output->ShareDataWith(*x);
-  } else {
-    output->CopyDataFrom(*x);
-  }
-
-  param.x->ZynqTensor()->saveToFile("reshape_in", true);
-  output->ZynqTensor()->saveToFile("reshape_out", true);
-
+  // if (inplace) {
+  //   output->ShareDataWith(*x);
+  // } else {
+  //   output->CopyDataFrom(*x);
+  // }
+  output->ZynqTensor()->copyFrom(x->ZynqTensor());
   output->Resize(output_dims);
 }
+ +#include "lite/kernels/fpga/split_compute.h" +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +void SplitCompute::PrepareForRun() { + auto& param = Param(); + zynqmp::SplitParam& split_param = pe_.param(); + split_param.input = param.x->ZynqTensor(); + auto& dout = param.output; + for (int i = 0; i < dout.size(); i++) { + dout[i]->mutable_data(); + split_param.outputs.push_back(dout[i]->ZynqTensor()); + } + + pe_.init(); + pe_.apply(); +} + +void SplitCompute::Run() { + zynqmp::SplitParam& split_param = pe_.param(); + pe_.dispatch(); + +#ifdef FPGA_PRINT_TENSOR + auto& dout = param.output; + for (int i = 0; i < dout.size(); i++) { + Debugger::get_instance().registerOutput("split", split_param.outputs[0]); + } + +#endif +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + split, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::SplitCompute, def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SectionsTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/split_compute.h b/lite/kernels/fpga/split_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d7680a66495c4e31591ecf6bdcdc73e3a71d802e --- /dev/null +++ b/lite/kernels/fpga/split_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <algorithm>
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+#include "lite/backends/fpga/KD/float16.hpp"
+#include "lite/backends/fpga/KD/pes/split_pe.hpp"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace fpga {
+
+class SplitCompute
+    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  void PrepareForRun() override;
+  void Run() override;
+
+  virtual ~SplitCompute() = default;
+
+ private:
+  zynqmp::SplitPE pe_;
+};
+
+}  // namespace fpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc
index e3bb813873d69d8f9d9939f06869e2640f416915..4ffeb4c82b10cee4094fbee53c7f39014e7fab84 100644
--- a/lite/kernels/fpga/transpose_compute.cc
+++ b/lite/kernels/fpga/transpose_compute.cc
@@ -81,7 +81,17 @@ void transposeCompute(operators::TransposeParam param) {
 }
 
 // Transpose
-void TransposeCompute::Run() { auto& param = this->Param<param_t>(); }
+void TransposeCompute::Run() {
+  auto& param = this->Param<param_t>();
+  param.output->mutable_data<float16>();
+  param.x->ZynqTensor()->invalidate();
+  param.x->ZynqTensor()->unalignImage();
+  if (param.x->dims().size() != 4) {
+    transposeCompute(param);
+  } else {
+    param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+  }
+}
 
 // Transpose2
 void Transpose2Compute::Run() {
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index c212fb9b0465824b7a87eef2e87033bf967736e5..3ef584ab37280ad59b7670c7f414d1fcbc522316 100755
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
 add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
 add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
+
 add_kernel(one_hot_compute_host Host extra SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
 
 #lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
diff --git a/lite/operators/concat_op.cc b/lite/operators/concat_op.cc
old mode 100644
new mode 100755
diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc
old mode 100644
new mode 100755