diff --git a/.gitignore b/.gitignore index 9db2912c07bc2d6abb01c322a25519ac0ff158fa..ce40fea2be877c09bb299781d8937c081843b50c 100644 --- a/.gitignore +++ b/.gitignore @@ -104,3 +104,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources + +# generated files +lite/api/paddle_use_kernels.h +lite/api/paddle_use_ops.h +lite/backends/arm/math/dotprod/gemm_sdot.h +lite/tools/cmake_tools/ast.pyc + diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100644 new mode 100755 index 77a94bea1efcdafaa67b4c078bfb0a756f7b1cec..786b1322b346631d1570a6ebd9c572302531db4e --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,8 @@ if (WITH_PADDLE_MOBILE) return() endif(WITH_PADDLE_MOBILE) +# set(CMAKE_BUILD_TYPE DEBUG) + set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(CMAKE_CXX_STANDARD 11) diff --git a/fpga.sh b/fpga.sh new file mode 100644 index 0000000000000000000000000000000000000000..e0501ac14b5269139688169017c057bd2458ab7c --- /dev/null +++ b/fpga.sh @@ -0,0 +1,5 @@ +./lite/tools/build.sh \ + --arm_os=armlinux \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + test diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt old mode 100644 new mode 100755 index 70239e94e7a3064fb383246623d05a2079dda1fa..c3388350228207f843c9cbd2c1a3525ba0ef5645 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -198,14 +198,24 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels}) + lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + + lite_cc_test(test_inceptionv3_fpga SRCS inceptionv3_test_fpga.cc + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc DEPS ${lite_model_test_DEPS} CL_DEPS ${opencl_kernels} ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) - # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc - # DEPS ${lite_model_test_DEPS}) + lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc + DEPS ${lite_model_test_DEPS}) # lite_cc_test(model_run_test_image SRCS model_run_test_image.cc # DEPS ${lite_model_test_DEPS} diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 990d08f18f541088d797510e9dbd4881d42b164f..9afe7e264a960144637df609daeca80f4ed3b2ac 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) { << kpf_path; } +#ifndef LITE_WITH_FPGA lite::Tensor *Predictor::GetInput(size_t offset) { CHECK(input_names_.size() > offset) << "The network has " << input_names_.size() << " inputs" @@ -130,6 +131,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) { << " in exec_scope"; return in_var->GetMutable(); } +#else +lite::Tensor *Predictor::GetInput(size_t offset) { + auto *_feed_list = exec_scope_->FindVar("feed"); + CHECK(_feed_list) << "no feed variable in exec_scope"; + auto *feed_list = _feed_list->GetMutable>(); + if (offset >= feed_list->size()) { + feed_list->resize(offset + 1); + } + return &feed_list->at(offset); +} +#endif // get inputs names std::vector Predictor::GetInputNames() { return 
input_names_; }
@@ -167,6 +179,8 @@ void Predictor::PrepareFeedFetch() {
   }
 }
 
+#ifndef LITE_WITH_FPGA
+
 const lite::Tensor *Predictor::GetOutput(size_t offset) const {
   CHECK(output_names_.size() > offset)
       << "The network has " << output_names_.size() << " outputs"
@@ -186,6 +200,29 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
   }
   return outputs;
 }
+#else
+
+const lite::Tensor *Predictor::GetOutput(size_t offset) const {
+  auto *_fetch_list = exec_scope_->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
+  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
+  return &fetch_list.at(offset);
+}
+
+std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
+  auto *_fetch_list = exec_scope_->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
+
+  std::vector<const lite::Tensor *> outputs;
+  // Take each element's address in place; iterating by value would collect
+  // pointers to loop-local copies.
+  for (auto &out : fetch_list) {
+    outputs.push_back(&out);
+  }
+  return outputs;
+}
+
+#endif
 
 const cpp::ProgramDesc &Predictor::program_desc() const {
   return program_desc_;
diff --git a/lite/api/inceptionv3_test_fpga.cc b/lite/api/inceptionv3_test_fpga.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1dff7990e965465e73ed895c17a15646ef1c993
--- /dev/null
+++ b/lite/api/inceptionv3_test_fpga.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
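+// NOTE: FPGA smoke test for Inception v3 (the TEST suite name below was
+// carried over from the ResNet50 test it is based on). It builds the
+// predictor from FLAGS_model_dir with FPGA/Host/ARM as valid places, fills a
+// 1x3x224x224 input with ones, and runs FLAGS_warmup warm-up iterations plus
+// two timed ones; only the Speed Report header is logged.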
+ +#include +#include +#include +#include "lite/api/cxx_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_FPGA +TEST(ResNet50, test) { + lite::Predictor predictor; + + std::vector valid_places({ + Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + + // std::vector valid_places( + // {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}}); + + predictor.Build("", + FLAGS_model_dir + "/model", + FLAGS_model_dir + "/params", + valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < 2; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; +} +#endif + +} // namespace lite +} // namespace paddle diff --git a/lite/api/ocr_attention_test_fpga.cc b/lite/api/ocr_attention_test_fpga.cc new file mode 100755 index 0000000000000000000000000000000000000000..326de883d1625f7196426094cc4ccec970f8a399 --- /dev/null +++ b/lite/api/ocr_attention_test_fpga.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
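+// NOTE: FPGA test for the OCR attention model. It feeds four inputs: the
+// image (1x1x100x200, values read from FLAGS_input_file), beam-search init
+// ids and init scores (both 1x1 with LoD {{0, 1}, {0, 1}}), and a 1x33x10x23
+// position encoding whose first 10 channels one-hot encode the row index and
+// whose remaining 23 channels one-hot encode the column index. The first
+// output is dumped to a file under plate_data/.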
+
+#include <gtest/gtest.h>
+#include <fstream>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_file, "", "input_file");
+
+namespace paddle {
+namespace lite {
+
+void read_from_file(const std::string& path, float* data, int num) {
+  std::ifstream file_stream;
+  file_stream.open(path);
+  if (!file_stream) {
+    exit(-1);
+    return;
+  }
+
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+}
+
+void chw_to_hwc(float* src, float* dst, int channel, int height, int width) {
+  int amount_per_row = width * channel;
+  int index = 0;
+  for (int c = 0; c < channel; c++) {
+    for (int h = 0; h < height; h++) {
+      int offset_height = h * amount_per_row;
+      for (int w = 0; w < width; w++) {
+        int dst_index = offset_height + w * channel + c;
+        dst[dst_index] = src[index];
+        index = index + 1;
+      }
+    }
+  }
+}
+
+void TestModel(const std::vector<Place>& valid_places,
+               const Place& preferred_place,
+               bool use_npu = false) {
+  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
+  lite::Predictor predictor;
+
+  // predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
+  predictor.Build("", "attention/model", "attention/params", valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 1, 100, 200})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
+
+  read_from_file(FLAGS_input_file, data, 100 * 200);
+  //=============================================
+  auto* init_ids = predictor.GetInput(1);
+  init_ids->Resize(DDim(std::vector<int64_t>({1, 1})));
+  auto* data_ids = init_ids->mutable_data<float>();
+  auto ids_size = init_ids->dims().production();
+  for (int i = 0; i < ids_size; i++) {
+    data_ids[i] = 0;
+  }
+  auto lod_ids = init_ids->mutable_lod();
+  std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
+  *lod_ids = lod_i;
+
+  //=============================================
+  auto* init_scores = predictor.GetInput(2);
+  init_scores->Resize(DDim(std::vector<int64_t>({1, 1})));
+  auto* data_scores = init_scores->mutable_data<float>();
+  // Size this loop from init_scores itself; sizing it from input_tensor
+  // would write far past the end of this one-element tensor.
+  auto scores_size = init_scores->dims().production();
+  for (int i = 0; i < scores_size; i++) {
+    data_scores[i] = 0;
+  }
+  auto lod_scores = init_scores->mutable_lod();
+  std::vector<std::vector<uint64_t>> lod_s{{0, 1}, {0, 1}};
+  *lod_scores = lod_s;
+
+  //=============================================
+  auto* position_encoding = predictor.GetInput(3);
+  position_encoding->Resize(DDim(std::vector<int64_t>({1, 33, 10, 23})));
+  auto* position_encoding_data = position_encoding->mutable_data<float>();
+
+  float* temp_data = position_encoding_data;
+
+  for (int i = 0; i < position_encoding->dims().production(); ++i) {
+    temp_data[i] = 0;
+  }
+  int index = 0;
+  for (int i = 0; i < 10; i++) {
+    for (int row = 0; row < 10; row++) {
+      for (int col = 0; col < 23; col++) {
+        if (i == row) {
+          temp_data[index] = 1.0f;
+        } else {
+          temp_data[index] = 0.0f;
+        }
+        index++;
+      }
+    }
+  }
+  for (int i = 0; i < 23; i++) {
+    for (int row = 0; row < 10; row++) {
+      for (int col = 0; col < 23; col++) {
+        if (i == col) {
+          temp_data[index] = 1.0f;
+        } else {
+          temp_data[index] = 0.0f;
+        }
+        index++;
+      }
+    }
+  }
+  // chw_to_hwc(temp_data, position_encoding_data, 33, 10, 23);
+  // delete[] temp_data;
+
+  // read_from_file("position_encoding.data", position_encoding_data, 33 * 10 *
+  // 23);
+
+  auto start = GetCurrentUS();
+  // Run FLAGS_repeats iterations so the average below matches its divisor.
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+
+  std::cout << "================== Speed Report ==================="
+            << std::endl;
+  std::cout << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms on average." << std::endl;
+
+  auto* out = predictor.GetOutput(0);
+
+  std::string file = "plate_data/" + FLAGS_input_file.substr(9);
+  std::cout << "file:::" << file << std::endl;
+
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < out->dims().production(); i++) {
+    float value = out->data<float>()[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();
+}
+
+TEST(OcrAttention, test_arm) {
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+  TestModel(valid_places, Place{TARGET(kARM), PRECISION(kFloat)});
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/api/resnet50_test_fpga.cc b/lite/api/resnet50_test_fpga.cc
index ab647f96998f1c0e73476369611218d0a7930c57..75e6f0cbbc43c3cd7eb9bfa89bc004554ea6f85b 100644
--- a/lite/api/resnet50_test_fpga.cc
+++ b/lite/api/resnet50_test_fpga.cc
@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
   std::vector<Place> valid_places(
       {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
 
-  predictor.Build(FLAGS_model_dir,
-                  "",
-                  "",
-                  Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
-                  valid_places);
+  predictor.Build(FLAGS_model_dir, "", "", valid_places);
 
   auto* input_tensor = predictor.GetInput(0);
   input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
diff --git a/lite/api/test_ssd_fpga.cc b/lite/api/test_ssd_fpga.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb2d75671a637c8042b39e2e90d70f1ae9e6f2fd
--- /dev/null
+++ b/lite/api/test_ssd_fpga.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
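+// NOTE: FPGA test for an SSD-style detection model. It builds the predictor
+// from separate model/params files under FLAGS_model_dir, feeds a
+// 1x3x300x300 input filled with ones (the file-based input path is left
+// commented out), runs the net twice, prints the detection output, and also
+// writes it to a file under output/.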
+
+#include <dirent.h>
+#include <gtest/gtest.h>
+#include <cstring>
+#include <fstream>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_file, "", "input_file");
+
+namespace paddle {
+namespace lite {
+
+std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
+  std::vector<std::string> files;
+  std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()),
+                                     [](DIR* dir) { dir && closedir(dir); });
+  struct dirent* dirent_ptr;
+  if (!directory_ptr) {
+    std::cout << "Error opening " << dir << ": " << std::strerror(errno)
+              << std::endl;
+    return files;
+  }
+
+  while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
+    files.push_back(std::string(dirent_ptr->d_name));
+  }
+  return files;
+}
+
+void readFromFile(int num, std::string path, float* data) {
+  std::ifstream file_stream(path);
+  // file_stream.open(path);
+  if (!file_stream.good()) {
+    std::cout << "file: " << path << " does not exist!\n";
+    exit(-1);
+    return;
+  }
+  // float* data = mutableData<float>();
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+  file_stream.close();
+}
+
+// #ifdef LITE_WITH_FPGA
+TEST(SSD, test) {
+  lite::Predictor predictor;
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+
+  // predictor.Build(FLAGS_model_dir, "", "", valid_places);
+  predictor.Build("",
+                  FLAGS_model_dir + "/model",
+                  FLAGS_model_dir + "/params",
+                  valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  int width = 300;
+  int height = 300;
+
+  // std::ifstream file_stream(FLAGS_input_file);
+  // if (!file_stream.good()) {
+  //   std::cout << "file: " << FLAGS_input_file << " does not exist!\n";
+  //   exit(-1);
+  //   return;
+  // }
+
+  // file_stream >> height;
+  // file_stream >> width;
+
+  input_tensor->Resize(
+      DDim(std::vector<int64_t>({1, 3, height, width})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
+
+  // readFromFile(item_size, "car.data", data);
+
+  int num = 3 * width * height;
+
+  // for (int i = 0; i < num; ++i) {
+  //   float value = 0;
+  //   file_stream >> value;
+  //   data[i] = value;
+  // }
+  // file_stream.close();
+
+  for (int i = 0; i < 2; ++i) {
+    predictor.Run();
+  }
+
+  auto* out = predictor.GetOutput(0);
+  for (int i = 0; i < out->dims().production(); i++) {
+    std::cout << ":" << out->data<float>()[i] << std::endl;
+  }
+
+  std::string file = "output/" + FLAGS_input_file.substr(6);
+  std::cout << "file:::" << file << std::endl;
+
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < out->dims().production(); i++) {
+    float value = out->data<float>()[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();
+
+  LOG(INFO) << "================== Speed Report ===================";
+}
+// #endif
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
old mode 100644
new mode 100755
index 2b9b23070616baf18f347c6b2af2d87a300d428f..5aa6511cdfbcfc831c14bcf03a0c2d8096e30aa4
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -32,7 +32,8 @@ class Debugger {
   }
 
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
-    if (op_type != "conv") {  // NOLINT
+    if (op_config[op_type]) {
+
tensor->saveToFile(op_type, true); } } @@ -40,8 +41,19 @@ class Debugger { std::unordered_map op_config; Debugger() { op_config["concat"] = true; + op_config["pooling"] = true; op_config["conv"] = true; op_config["crop"] = true; + op_config["feed"] = true; + op_config["mul"] = true; + op_config["fetch"] = true; + op_config["boxes"] = true; + op_config["scores"] = true; + op_config["nms"] = true; + op_config["pb_boxes"] = true; + op_config["pb_variances"] = true; + // op_config["fc"] = true; + op_config["softmax"] = true; } }; @@ -131,9 +143,7 @@ inline void save_tensor(const lite::Tensor* t, chw_to_hwc(const_cast(t), dst); data = dst; } - save_float(data, name, t->numel()); - delete[] dst; } } // namespace lite diff --git a/lite/backends/fpga/KD/fpga_cv.cpp b/lite/backends/fpga/KD/fpga_cv.cpp deleted file mode 100644 index 15a20e368b09f193e3f43b574ff3682ce96782ad..0000000000000000000000000000000000000000 --- a/lite/backends/fpga/KD/fpga_cv.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/fpga_cv.hpp" - -using paddle::zynqmp::float16; - -void fpga_resize(float* input, - int input_width, - int input_height, - int input_channel, - uint8_t* output, - int output_width, - int output_height) { - paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0}; - paddle::zynqmp::config_inplace(inplace_args); - - paddle::zynqmp::ImageInputArgs input_args = {nullptr}; - input_args.address = nullptr; - input_args.scale_address = nullptr; - - float16* input_image_address = - reinterpret_cast(paddle::zynqmp::fpga_malloc( - input_width * input_height * input_channel * sizeof(float16))); - int index = 0; - - for (int i = 0; i < input_width * input_height * input_channel; i++) { - input_image_address[i] = float16(1.0 * input[i]); - } - - paddle::zynqmp::ResizeArgs resize_args = {0}; - - resize_args.input_width = input_width; - resize_args.input_height = input_height; - resize_args.image_channel = input_channel; - resize_args.output_width = output_width; - resize_args.output_height = output_height; - float height_ratio = static_cast(input_height) / - static_cast(resize_args.output_height); - float width_ratio = static_cast(input_width) / - static_cast(resize_args.output_width); - resize_args.height_ratio = *reinterpret_cast(&height_ratio); - resize_args.width_ratio = *reinterpret_cast(&width_ratio); - - int output_size = - resize_args.output_width * resize_args.output_height * input_channel; - float16* fpga_output = reinterpret_cast( - paddle::zynqmp::fpga_malloc(output_size * sizeof(float16))); - resize_args.input_image_address = input_image_address; - resize_args.output_image_address = fpga_output; - - memset(fpga_output, 0, output_size * sizeof(float16)); - paddle::zynqmp::fpga_flush( - input_image_address, - input_width * input_height * input_channel * sizeof(float16)); - paddle::zynqmp::fpga_flush(resize_args.output_image_address, - output_size * sizeof(float16)); - int ret = 
paddle::zynqmp::compute_fpga_resize(resize_args); - if (ret == 0) { - paddle::zynqmp::fpga_invalidate(resize_args.output_image_address, - output_size * sizeof(float16)); - } - - for (int i = 0; i < output_size; i++) { - output[i] = fpga_output[i]; - } -} diff --git a/lite/backends/fpga/KD/fpga_cv.hpp b/lite/backends/fpga/KD/fpga_cv.hpp deleted file mode 100644 index 6aa52edfbb704a0571fb1052aff6ecf022e49596..0000000000000000000000000000000000000000 --- a/lite/backends/fpga/KD/fpga_cv.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" -#include "lite/backends/fpga/KD/pe.hpp" - -void fpga_resize(float* input, - int input_width, - int input_height, - int input_channel, - uint8_t* output, - int output_width, - int output_height); diff --git a/lite/backends/fpga/KD/llapi/config.h b/lite/backends/fpga/KD/llapi/config.h deleted file mode 100755 index acf8c8adf4fc5593dcc4238ddc762fdb9fea6760..0000000000000000000000000000000000000000 --- a/lite/backends/fpga/KD/llapi/config.h +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define PADDLE_LITE_ZU5 -#define FPGA_PRINT_MODE -#define PADDLE_LITE_PROFILE diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp old mode 100644 new mode 100755 index 30250969b6fbe6e9e5ce7e9f96f963e8bee89224..b6932bc27f0019af58cea00e4b5422396d838208 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ b/lite/backends/fpga/KD/llapi/filter.cpp @@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) { std::ofstream ofs; ofs.open(name); - int8_t* data = static_cast data_in; + int8_t* data = static_cast(data_in); for (int i = 0; i < size; i++) { float value = data[i]; ofs << value << std::endl; @@ -221,6 +221,7 @@ int8_t* format_filter(float* data_in, align_to_x(num_per_div_before_alignment, filter_num_alignment); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + // int num_after_alignment = num_per_div_after_alignment * div_num; int residual = num % num_per_div_before_alignment; int num_after_alignment = num_per_div_after_alignment * ((residual == 0) ? 
div_num : (div_num - 1)) + diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp old mode 100755 new mode 100644 index 06488469d97c077a34b3cfdb8a049c8cd61dfc93..68d0b6c68b722f9c5cf31139ed7308516889bd8c --- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp +++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp @@ -62,6 +62,7 @@ void reset_device() { // memory management; void *fpga_malloc(size_t size) { #ifdef ENABLE_DEBUG + std::cout << "fpga_malloc:" << size << std::endl; #endif #ifdef PADDLE_OS_LINUX void *ptr = reinterpret_cast( diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp old mode 100644 new mode 100755 index fb15eaf77822eed076ec2001bace6871e93587ff..f274ccab0b755ebd9bf26bed4b41902d29bc1305 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/conv_pe.hpp @@ -62,6 +62,7 @@ class ConvPE : public PE { param_.filter->shape().height() == 1) { // NOLINT } if (!use_cpu_) { // NOLINT + // param_.filter->releaseData(); } } @@ -92,6 +93,7 @@ class ConvPE : public PE { int kernel_width = param_.filter->shape().width(); int kernel_step_h = param_.strides[0]; int kernel_step_w = param_.strides[1]; + int pooled_height_ = output->shape().height(); int pooled_width_ = out_width; int filter_chw = image_channels * kernel_height * kernel_width; diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp index ecee45569c8df3d3e3926b2ca78cb49da8415aa4..8751f013967ed3b44a6c6b11560a2f350bc7d6bf 100755 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ b/lite/backends/fpga/KD/pes/conv_process.hpp @@ -266,8 +266,8 @@ inline void split_filter_num(const ConvParam& c_param) { int filter_num_alignment = filter::get_filter_num_alignment(); int aligned_num = align_to_x(num / param.groups, filter_num_alignment) * param.groups; - split_num = filter::calc_split_num(aligned_num, div_capacity); + split_num = filter::calc_split_num(aligned_num, div_capacity); Shape& out_shape = out->shape(); for (int i = 0; i < split_num; i++) { BasicConvParam* conv_param = new BasicConvParam(); @@ -364,6 +364,7 @@ inline void split_filter_num(const ConvParam& c_param) { args.image.height = input->shape().height(); args.image.pad_width = param.paddings[1]; args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; args.output.address = out_address; @@ -419,6 +420,7 @@ inline void split_channel(const ConvParam& c_param) { } scale.flush(); bias.flush(); + // Shape sb_shape(N, {2 * channel}); format_scale_bias(&scale, &bias, &conv_param->filter, @@ -446,6 +448,7 @@ inline void split_channel(const ConvParam& c_param) { args.image.height = conv_param->input.shape().height(); args.image.pad_width = param.paddings[1]; args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); @@ -476,6 +479,7 @@ inline bool compute_conv(const ConvParam& c_conv_params) { } size_t size = params.size(); if (ret == 0 && size > 1) { + // Tensor* output = conv_params.output; Tensor& img = params[0]->output; for (int i = 0; i < 1; i++) { for (int i = 0; i < img.shape().numel(); i++) { diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp index 
0efca2ec2e60e8973d92f41463b0444722f2a73b..d610780628612f5a4ac322f06c2c6e9ca7812925 100755 --- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp @@ -62,6 +62,7 @@ class DepthwiseConvPE : public PE { float16* scale_data = param_.scale()->data(); float16* filter_data = param.quantizedFilter()->mutableData( FP16, param.filter->shape()); + memcpy(filter_data, scale_data, param.filter->shape().numel() * sizeof(float16)); diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp old mode 100644 new mode 100755 index dcacab4eeef32b245d4126b72597b398a6627ba6..bbdf2f371f13ce6e4b1ecb6104dec8f35c1f9c3d --- a/lite/backends/fpga/KD/pes/gru_pe.hpp +++ b/lite/backends/fpga/KD/pes/gru_pe.hpp @@ -47,8 +47,10 @@ class GRUPE : public PE { zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}}; float16* prev_hidden_data = prev_hidden_.mutableData(zynqmp::FP16, hidden_shape); + // set previous hidden data to 0; memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16)); + // copy 2/3 weight from param.weight; zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}}; float* weight_data = weight_.mutableData(zynqmp::FP32, weight_shape); memset(weight_data, 0, weight_shape.numel() * sizeof(float)); @@ -115,11 +117,9 @@ class GRUPE : public PE { if (hidden_prev) { // TODO(chonwhite): change to pre_out; prev_hidden_.copyFrom(value.pre_output); - prev_hidden_.saveToFile("prev_.txt"); } - mul_pe_.dispatch(); - reset_hidden_.saveToFile("reset_hidden_.txt"); + // reset_hidden_.saveToFile("reset_hidden_.txt"); update_gate_data += stride_update; reset_gate_data += stride_update; @@ -170,6 +170,7 @@ class GRUPE : public PE { zynqmp::Tensor bias_; zynqmp::Tensor weight_; zynqmp::Tensor state_weight_; + zynqmp::Tensor update_gate_; zynqmp::Tensor reset_gate_; zynqmp::Tensor cell_state_; diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index a8725b51a690e0e134785fcfdb3dd70edeffd441..84ed4f946e1a394cb0fc40d7c156faf534e1f8db 100755 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -66,7 +66,7 @@ class PoolingPE : public PE { param_.poolingArgs = args; use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && - (k_width > 7 || k_height > 7); + (k_width > 255 || k_height > 255); use_cpu_ = param_.type == AVERAGE; } @@ -76,6 +76,7 @@ class PoolingPE : public PE { input->syncToCPU(); Tensor float_input; + // Tensor float_output; float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); @@ -188,7 +189,9 @@ class PoolingPE : public PE { bool dispatch() { if (use_cpu_) { + // cpu_compute(); compute(); + // exit(-1); return true; } param_.input->syncToDevice(); diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp old mode 100644 new mode 100755 index cc89ac943f90cb20062a3d6ef9a46b705193ad04..09755c65a322da8ccab0d57dd2e877712b112361 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -89,7 +89,6 @@ class ScalePE : public PE { } } } - float* scale_data_float = param_.scale->data(); for (int i = 0; i < repeat; i++) { for (int j = 0; j < length; j++) { diff --git a/lite/backends/fpga/KD/tensor.hpp 
b/lite/backends/fpga/KD/tensor.hpp old mode 100644 new mode 100755 index f1b07d02622fad32e99205667424a4cb3c9fb46d..f247741a02758a3eb0cfa7f6c653d21e4263601d --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -348,9 +348,19 @@ class Tensor { if (placeHolder_ == nullptr) { return; } + std::cout << scale()[0] << " , " << scale()[1] << std::endl; } - void printScale(std::string type) { printScale(); } + void printScale(std::string type) { + std::cout << type << " : " + << std::to_string(shape_->num()) + "_" + + std::to_string(shape_->channel()) + "_" + + std::to_string(shape_->height()) + "_" + + std::to_string(shape_->width()) + << std::endl; + std::cout << type << " \n"; + printScale(); + } std::string dimsFileName() { return std::to_string(shape_->num()) + "_" + @@ -378,6 +388,7 @@ class Tensor { static int counter = 0; std::string npath = std::to_string(counter) + "_" + path; counter++; + std::cout << "======== saving file:" << npath << " ============\n"; save_file_with_name(npath); } diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h old mode 100644 new mode 100755 index 311fc8a98400e5a6916ba1b9c8de1e6e0bcec4c0..49aded3d7d7db6d293e13298d98c2f3b165f411f --- a/lite/backends/fpga/lite_tensor.h +++ b/lite/backends/fpga/lite_tensor.h @@ -165,6 +165,9 @@ class TensorLite { TargetType target() const { return target_; } + // template + // TensorLite Slice(int64_t begin, int64_t end) const; + zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; } friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) { @@ -254,6 +257,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const { int64_t base = numel() / dims_[0]; TensorLite dst; + dst.target_ = target_; auto dst_dims = dims_; dst_dims[0] = end - begin; diff --git a/lite/core/kernel.h b/lite/core/kernel.h old mode 100644 new mode 100755 diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc old mode 100644 new mode 100755 index ae74bd8d4d5647139a13509dfda0bb2b41ecc5c7..17a327f2535a88d943dd36e8b5f4f5d2c8f629cf --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -101,7 +101,6 @@ void TypeTargetTransformPass::AddIoCopyInst( auto io_copy_output_name = string_format("%s/target_trans", in->AsArg().name.c_str()); // string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id()); - if (copied_nodes->count(in->AsArg().name)) { // Remove the old link RemoveDirectedLink(in, inst_node); diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h old mode 100644 new mode 100755 diff --git a/lite/core/program.cc b/lite/core/program.cc index b0c61bf00ed29e2fa71072b64f11f6ba30f77691..2c90a12b7709323468ed21ab244e3829b62f2ebb 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -138,11 +138,16 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { void RuntimeProgram::Run() { for (auto& inst : instructions_) { std::string op_type = inst.op()->op_info()->Type(); + +#ifndef LITE_WITH_FPGA if (op_type == "feed" || op_type == "fetch") continue; +#endif inst.Run(); #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE - LITE_PRECISION_PROFILE(inst) +#ifndef LITE_WITH_FPGA +// LITE_PRECISION_PROFILE(inst) +#endif #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } diff --git a/lite/core/tensor.h b/lite/core/tensor.h old mode 100644 new mode 100755 diff --git a/lite/gen_code/paddle_infer.h b/lite/gen_code/paddle_infer.h index 
e01ffc25e29ca94166e8fe12b0643ae9e914001d..2449e1e5d3fb721a39760e78a0417bf9491d8cef 100644 --- a/lite/gen_code/paddle_infer.h +++ b/lite/gen_code/paddle_infer.h @@ -46,7 +46,7 @@ class Tensor { */ class PaddlePredictor { public: - void Init(); + void Init() {} std::unique_ptr GetTensor(const std::string &id) const; std::unique_ptr GetMutableTensor(const std::string &id); diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc old mode 100644 new mode 100755 index 266ae1fc916af4303aca274c39b9b4923fdbb154..0b92317ac51b0af24443ec24436f6a483198dbbc --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -62,6 +62,10 @@ void CastCompute::Run() { int32_t* out_data = param.Out->mutable_data(); std::transform( x_data_begin, x_data_end, out_data, TransOp); + } else if (param.in_dtype == 3 && param.out_dtype == 5) { + const auto* x_data = param.X->data(); + auto* o_data = param.Out->mutable_data(); + memcpy(o_data, x_data, sizeof(float) * param.X->numel()); } else { LOG(FATAL) << "other has not been implemented"; } diff --git a/lite/kernels/arm/fill_constant_compute.cc b/lite/kernels/arm/fill_constant_compute.cc old mode 100644 new mode 100755 index ad475538576b9cc73a43bac49cba1a6cf1c73edb..badd3f90288e0885aacdef6c53fbb6cc9b73ea7d --- a/lite/kernels/arm/fill_constant_compute.cc +++ b/lite/kernels/arm/fill_constant_compute.cc @@ -60,25 +60,10 @@ class FillConstantCompute : public KernelLite { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); - if (param.dtype == static_cast(lite::core::FluidType::FP32)) { - auto data = param.Out->template mutable_data(); - for (int i = 0; i < param.Out->numel(); i++) { - data[i] = param.value; - } - } else if (param.dtype == - static_cast(lite::core::FluidType::INT32)) { - auto data = param.Out->template mutable_data(); - for (int i = 0; i < param.Out->numel(); i++) { - data[i] = param.value; - } - } else if (param.dtype == - static_cast(lite::core::FluidType::INT8)) { - auto data = param.Out->template mutable_data(); - for (int i = 0; i < param.Out->numel(); i++) { - data[i] = param.value; - } - } else { - LOG(FATAL) << "not supported dtype " << param.dtype; + // auto data = param.Out->template mutable_data(); + auto data = param.Out->template mutable_data(); + for (int i = 0; i < param.Out->numel(); i++) { + data[i] = param.value; } } @@ -94,32 +79,38 @@ class FillConstantBatchLikeCompute auto& param = *param_.get_mutable(); auto& context = ctx_->As(); - if (param.input->lod().size() && param.input_dim_idx == 0) { - auto odims = param.out->dims(); - odims[param.output_dim_idx] = param.input->lod().back().size() - 1; - param.out->Resize(odims); + // auto data = param.out->template mutable_data(); + auto data = param.out->template mutable_data(); + for (int i = 0; i < param.out->numel(); i++) { + data[i] = param.value; } - if (param.dtype == static_cast(lite::core::FluidType::FP32)) { - auto data = param.out->template mutable_data(); - for (int i = 0; i < param.out->numel(); i++) { - data[i] = param.value; - } - } else if (param.dtype == - static_cast(lite::core::FluidType::INT32)) { - auto data = param.out->template mutable_data(); - for (int i = 0; i < param.out->numel(); i++) { - data[i] = param.value; - } - } else if (param.dtype == - static_cast(lite::core::FluidType::INT8)) { - auto data = param.out->template mutable_data(); - for (int i = 0; i < param.out->numel(); i++) { - data[i] = param.value; - } - } else { - LOG(FATAL) << "not supported dtype " << param.dtype; - } + // if 
(param.input->lod().size() && param.input_dim_idx == 0) { + // auto odims = param.out->dims(); + // odims[param.output_dim_idx] = param.input->lod().back().size() - 1; + // param.out->Resize(odims); + // } + + // if (param.dtype == static_cast(lite::core::FluidType::FP32)) { + // auto data = param.out->template mutable_data(); + // for (int i = 0; i < param.out->numel(); i++) { + // data[i] = param.value; + // } + // } else if (param.dtype == + // static_cast(lite::core::FluidType::INT32)) { + // auto data = param.out->template mutable_data(); + // for (int i = 0; i < param.out->numel(); i++) { + // data[i] = param.value; + // } + // } else if (param.dtype == + // static_cast(lite::core::FluidType::INT8)) { + // auto data = param.out->template mutable_data(); + // for (int i = 0; i < param.out->numel(); i++) { + // data[i] = param.value; + // } + // } else { + // LOG(FATAL) << "not supported dtype " << param.dtype; + // } } virtual ~FillConstantBatchLikeCompute() = default; @@ -144,6 +135,7 @@ REGISTER_LITE_KERNEL(fill_constant, {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); + REGISTER_LITE_KERNEL(fill_constant_batch_size_like, kARM, kAny, @@ -153,3 +145,4 @@ REGISTER_LITE_KERNEL(fill_constant_batch_size_like, .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); + diff --git a/lite/kernels/arm/layout_compute.cc b/lite/kernels/arm/layout_compute.cc index bc52c5ea3ee452033cfd3c7d559cb88b21ca48f6..221f081feb0dc9873a183d5df215342da7fef6b7 100644 --- a/lite/kernels/arm/layout_compute.cc +++ b/lite/kernels/arm/layout_compute.cc @@ -59,6 +59,8 @@ namespace arm { template <> void NCHWToNHWCCompute::Run() { NCHWTONHWC(float); + // auto& param = this->template Param(); + // param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } template <> @@ -69,6 +71,9 @@ void NCHWToNHWCCompute::Run() { template <> void NHWCToNCHWCompute::Run() { NHWCTONCHW(float); + // auto& param = this->template Param(); + // param.y->mutable_data(); + // param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } template <> diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc old mode 100644 new mode 100755 index ba58b378f4dda22fd78ce76b80bdbca8d8f284a3..fa7e2c0c3ae4580f5d19e82f7c48c74db3058847 --- a/lite/kernels/arm/lookup_table_compute.cc +++ b/lite/kernels/arm/lookup_table_compute.cc @@ -28,6 +28,7 @@ namespace arm { void LookupTableCompute::Run() { auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); // inputs auto w = param.W; auto ids = param.Ids; @@ -36,7 +37,7 @@ void LookupTableCompute::Run() { auto table_dim = w->dims(); int64_t ids_numel = ids->numel(); - auto ids_data = ids->data(); + auto ids_data = ids->data(); int64_t row_number = table_dim[0]; int64_t row_width = table_dim[1]; @@ -75,14 +76,3 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); - -REGISTER_LITE_KERNEL(lookup_table_v2, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::LookupTableCompute, - def) - .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/fpga/CMakeLists.txt 
b/lite/kernels/fpga/CMakeLists.txt index 7c47e72872ecae6216288c20fa1a6ae30fac65bd..e71e5255ca6daa0c86c7f1b1c3d9174df66cac25 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -7,7 +7,9 @@ set(fpga_deps fpga_target_wrapper kernel_fpga) # add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps}) -# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) + +add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) + add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) # add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps}) add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps}) @@ -16,9 +18,11 @@ add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps}) + # add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps}) add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps}) add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps}) + # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps}) add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps}) add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/calib_compute.h b/lite/kernels/fpga/calib_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc index 7670bf0007def88c27c12ea54c569a7fcf263693..79329e99a3e5e812dca487c17452f3f5d1e96449 100755 --- a/lite/kernels/fpga/feed_compute.cc +++ b/lite/kernels/fpga/feed_compute.cc @@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL( PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(feed, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::FeedCompute, + def_host) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc old mode 100644 new mode 100755 index 9b5f3f60232bb8527f823395693cf3b3851bc04e..2d296f4d4a89b1fd86e5b2330d3caf44fbad0903 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() { } void FetchCompute::Run() { - pe_.dispatch(); auto& param = this->Param(); + auto fetch_list = param.fetch_list; + if (fetch_list->size() <= static_cast(param.col)) { + fetch_list->resize(param.col + 1); + } + Tensor& out = param.fetch_list->at(param.col); + out.Resize(param.input->dims()); + pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::OutputParam& fetch_param = pe_.param(); @@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kAny), - DATALAYOUT(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); REGISTER_LITE_KERNEL(fetch, @@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch, kNHWC, 
paddle::lite::kernels::fpga::FetchCompute, host_host) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kAny), - DATALAYOUT(kAny))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kAny), - DATALAYOUT(kAny))}) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); diff --git a/lite/kernels/fpga/gru_compute.cc b/lite/kernels/fpga/gru_compute.cc index 25fdcb505bcc8221da74b8cc87dc4fbec86b6190..a157382a6fb7ac39e4b102f5ac65dea337ed0f13 100755 --- a/lite/kernels/fpga/gru_compute.cc +++ b/lite/kernels/fpga/gru_compute.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include + #include #include #include @@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() { void GRUCompute::Run() { auto& param = this->Param(); param.hidden->mutable_data(); + // inputs auto input = param.input; auto h0 = param.h0; @@ -130,6 +132,7 @@ void GRUCompute::Run() { // //3. gru_value.prev_out_value = ordered_h0.mutable_data(); gru_tensors.pre_output = ordered_h0.ZynqTensor(); + } else { gru_value.prev_out_value = nullptr; gru_tensors.pre_output = nullptr; @@ -169,6 +172,7 @@ void GRUCompute::Run() { float* hidden_data = hidden_out.mutableData(zynqmp::FP32, float_input_shape); + gru_tensors.gate = &float_input; gru_tensors.output = &hidden_out; @@ -187,11 +191,6 @@ void GRUCompute::Run() { *(batch_hidden->mutable_lod()) = batch_gate->lod(); batch_hidden->mutable_data(); to_seq(*batch_hidden, hidden); - - save_tensor(const_cast(input), "_input.txt"); - save_tensor(hidden, "_gru.txt"); - - exit(-1); } } // namespace fpga diff --git a/lite/kernels/fpga/gru_compute.h b/lite/kernels/fpga/gru_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc old mode 100644 new mode 100755 index 10a0e3116b920a2f408606ef211f408ed2279f60..57a76dee97ca889cd645a2c8f81b5a2354f9b11f --- a/lite/kernels/fpga/io_copy_compute.cc +++ b/lite/kernels/fpga/io_copy_compute.cc @@ -119,7 +119,81 @@ class IoCopyFpgaToHostCompute auto out_lod = param.y->mutable_lod(); *out_lod = param.x->lod(); } + std::string doc() const override { return "Copy IO from FPGA to HOST"; } +}; + +void hwc_to_chw(float* chw_data, + float* hwc_data, + int num, + int channel, + int height, + int width) { + int chw = channel * height * width; + int wc = width * channel; + int wh = width * height; + int index = 0; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + chw_data[n * chw + c * wh + h * width + w] = hwc_data[index]; + index++; + } + } + } + } +} + +class IoCopyFpgaToHostCHWCompute + : public KernelLite { + public: + void Run() override { + auto& param = Param(); + CHECK(param.x->target() == TARGET(kHost) || + param.x->target() == TARGET(kFPGA)); + + Tensor hwc; + hwc.Resize(param.y->dims()); + float* hwc_data = hwc.mutable_data(); + + float* chw_data = param.y->mutable_data(); + param.y->ZynqTensor()->setDataType(zynqmp::FP32); + param.x->ZynqTensor()->syncToDevice(); + if (param.x->ZynqTensor()->aligned() && + param.x->ZynqTensor()->shape().shouldAlign()) { + zynqmp::Tensor tempTensor; + tempTensor.mutableData(zynqmp::FP16, + param.x->ZynqTensor()->shape()); + tempTensor.copyFrom(param.x->ZynqTensor()); + tempTensor.setAligned(true); + tempTensor.unalignImage(); + hwc.ZynqTensor()->copyFrom(&tempTensor); + } 
else { + hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor()); + } + + int num = 1; + int channel = 1; + int height = 1; + int width = 1; + + auto dims = param.y->ZynqTensor()->shape(); + + hwc_to_chw(chw_data, + hwc_data, + dims.num(), + dims.channel(), + dims.height(), + dims.width()); + + param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + param.y->ZynqTensor()->flush(); + auto out_lod = param.y->mutable_lod(); + *out_lod = param.x->lod(); + // param.x->ZynqTensor()->saveToFile("io_x", true); + // param.y->ZynqTensor()->saveToFile("io_y", true); + } std::string doc() const override { return "Copy IO from FPGA to HOST"; } }; @@ -170,7 +244,7 @@ REGISTER_LITE_KERNEL(io_copy, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kARM), + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); @@ -179,8 +253,8 @@ REGISTER_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, - paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute, - device_to_host_22) + paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute, + device_to_host_chw) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc index cee5e16205370df7faabc6f37d57fe360e8a9e67..4834054df6371a9faaa17bd17b53a29b999ddf03 100644 --- a/lite/kernels/fpga/multiclass_nms_compute.cc +++ b/lite/kernels/fpga/multiclass_nms_compute.cc @@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() { scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out); outs->ZynqTensor()->copyFrom(out.ZynqTensor()); } + outs->Resize({static_cast(e - s), out_dim}); } } LoD lod; diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc index afd14ccb4b4a9a4f1e93e1e38840035fb18186bb..c889df17cb72a6d3e8ab02efc729ecc93fb38a5f 100644 --- a/lite/kernels/fpga/prior_box_compute.cc +++ b/lite/kernels/fpga/prior_box_compute.cc @@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() { param.boxes->mutable_data(); param.variances->mutable_data(); + zynqmp::PriorBoxParam& priobox_param = pe_.param(); priobox_param.input = param.input->ZynqTensor(); priobox_param.image = param.image->ZynqTensor(); diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt old mode 100644 new mode 100755 index 428cc213ce63b8d24193a44f23d61fea78f63d6a..c6f2721d80b6fd584ce96e817476372e37b17ed8 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(one_hot_compute_host Host basic SRCS one_hot_compute.cc DEPS ${lite_kernel_deps}) #lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any) #lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any) diff --git 
a/lite/kernels/host/multiclass_nms_compute.cc b/lite/kernels/host/multiclass_nms_compute.cc
index 6f6079ef88fd9e61dbacb35c0ca8bdac536288a9..82a694b363b4cc219c48c294fb7545b26492f973 100644
--- a/lite/kernels/host/multiclass_nms_compute.cc
+++ b/lite/kernels/host/multiclass_nms_compute.cc
@@ -392,7 +392,13 @@ REGISTER_LITE_KERNEL(multiclass_nms,
                      kNCHW,
                      paddle::lite::kernels::host::MulticlassNmsCompute,
                      def)
-    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("BBoxes",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindInput("Scores",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
diff --git a/lite/kernels/host/one_hot_compute.cc b/lite/kernels/host/one_hot_compute.cc
new file mode 100755
index 0000000000000000000000000000000000000000..e0af6f5173f367bb9b2e06de10499ee36806379c
--- /dev/null
+++ b/lite/kernels/host/one_hot_compute.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "lite/backends/fpga/KD/debugger.hpp"
+#include "lite/kernels/host/one_hot_compute.h"
+#include "lite/utils/paddle_enforce.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+void OneHotCompute::Run() {
+  auto& param = Param<operators::OneHotParam>();
+  param.Out->mutable_data<float>();
+  int depth = param.depth;
+  if (param.depth_tensor) {
+    auto* depth_tensor = param.depth_tensor;
+    auto* depth_data = depth_tensor->data<int32_t>();
+    depth = depth_data[0];
+    auto in_dims = param.X->dims();
+    DDim out_dims(in_dims);
+    out_dims[out_dims.size() - 1] = depth;
+    param.Out->Resize(out_dims);
+  }
+
+  auto* p_in_data = param.X->data<int64_t>();
+  auto numel = param.X->numel();
+  auto* p_out_data = param.Out->mutable_data<float>();
+
+  for (int i = 0; i < param.Out->numel(); ++i) {
+    p_out_data[i] = 0;
+  }
+
+  // Index with the local depth: param.depth is stale when the depth was
+  // overridden by depth_tensor above.
+  if (param.allow_out_of_range) {
+    for (int i = 0; i < numel; ++i) {
+      if (p_in_data[i] >= 0 && p_in_data[i] < depth) {
+        *(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0;  // NOLINT
+      }
+    }
+  } else {
+    for (int i = 0; i < numel; ++i) {
+      PADDLE_ENFORCE_GE(
+          p_in_data[i], 0, "Illegal index value, should be at least 0.");
+      PADDLE_ENFORCE_LT(p_in_data[i],
+                        depth,
+                        "Illegal index value, should be less than depth (%d).",
+                        depth);
+      *(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0;  // NOLINT
+    }
+  }
+}
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(one_hot,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::OneHotCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
diff --git a/lite/kernels/host/one_hot_compute.h b/lite/kernels/host/one_hot_compute.h
new file mode 100755
index
0000000000000000000000000000000000000000..3a6c47fee31bc28f130c3de782c0c912c9f4b769 --- /dev/null +++ b/lite/kernels/host/one_hot_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class OneHotCompute + : public KernelLite { + public: + void Run() override; + + virtual ~OneHotCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/reshape_compute.cc b/lite/kernels/host/reshape_compute.cc index 02f99787e60e73d91ca8f65cb42dcd4c56e7212b..7a826ed32b02a85860038482d8ca55c5db32a9bf 100644 --- a/lite/kernels/host/reshape_compute.cc +++ b/lite/kernels/host/reshape_compute.cc @@ -46,19 +46,43 @@ REGISTER_LITE_KERNEL(reshape, paddle::lite::kernels::host::ReshapeCompute, def) .BindInput("X", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindInput("ShapeTensor", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindInput("Shape", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); +// REGISTER_LITE_KERNEL(reshape, +// kFPGA, +// kFP16, +// kNHWC, +// paddle::lite::kernels::host::ReshapeCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy( +// TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) +// .BindInput("ShapeTensor", +// {LiteType::GetTensorTy( +// TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))}) +// .BindInput("Shape", +// {LiteType::GetTensorTy( +// TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy( +// TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))}) +// .Finalize(); + REGISTER_LITE_KERNEL(reshape2, kHost, kAny, diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt old mode 100644 new mode 100755 index 190cf7194c19a47f377755a9e9b61d890bc1a262..a0c631e517afc8b3cdf9e97b00e327c477b6d026 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -136,6 +136,8 @@ add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) +add_operator(one_hot basic SRCS one_hot_op.cc DEPS ${op_DEPS}) + if (NOT LITE_WITH_X86) 
     lite_cc_test(test_fc_op SRCS fc_op_test.cc DEPS fc_op memory
diff --git a/lite/operators/one_hot_op.cc b/lite/operators/one_hot_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..023cdc23aeb8329736b7438af2c52cbfa899c75c
--- /dev/null
+++ b/lite/operators/one_hot_op.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/one_hot_op.h"
+#include "lite/core/op_registry.h"
+
+#include "lite/backends/fpga/KD/debugger.hpp"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool OneHotOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+  return true;
+}
+
+bool OneHotOp::InferShape() const {
+  CHECK_OR_FALSE(param_.Out);
+  // TODO(Superjomn) Enable data sharing.
+  auto out_dims = param_.X->dims();
+
+  out_dims[out_dims.size() - 1] = param_.depth;
+  param_.Out->Resize(out_dims);
+  return true;
+}
+
+bool OneHotOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  param_.X =
+      scope->FindVar(opdesc.Input("X").front())->GetMutable<lite::Tensor>();
+  param_.Out =
+      scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
+
+  if (opdesc.HasInput("depth_tensor")) {
+    auto depth_tensor = opdesc.Input("depth_tensor").front();
+    param_.depth_tensor =
+        scope->FindVar(depth_tensor)->GetMutable<lite::Tensor>();
+  }
+
+  CHECK(param_.X);
+  CHECK(param_.Out);
+  param_.depth = opdesc.GetAttr<int>("depth");
+  param_.dtype = opdesc.GetAttr<int>("dtype");
+
+  if (opdesc.HasAttr("allow_out_of_range")) {
+    param_.allow_out_of_range = opdesc.GetAttr<bool>("allow_out_of_range");
+  }
+
+  auto out_lod = param_.Out->mutable_lod();
+  *out_lod = param_.X->lod();
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(one_hot, paddle::lite::operators::OneHotOp);
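For context, AttachImpl above consumes an op descriptor carrying the X/Out variables, an optional depth_tensor input, and the depth/dtype/allow_out_of_range attributes. A hedged sketch of such a descriptor follows; the variable names, the header path, and the dtype value are illustrative assumptions, not taken from this patch:

    #include "lite/model_parser/cpp/op_desc.h"  // assumed location of cpp::OpDesc

    // Hypothetical descriptor for a one_hot op, mirroring what AttachImpl
    // reads; "label_indices"/"label_one_hot" are made-up variable names.
    paddle::lite::cpp::OpDesc MakeOneHotDesc() {
      paddle::lite::cpp::OpDesc desc;
      desc.SetType("one_hot");
      desc.SetInput("X", {"label_indices"});
      desc.SetOutput("Out", {"label_one_hot"});
      desc.SetAttr<int>("depth", 4);
      desc.SetAttr<int>("dtype", 5);  // assumption: 5 maps to FP32 in the proto enum
      desc.SetAttr<bool>("allow_out_of_range", false);
      return desc;
    }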
diff --git a/lite/operators/one_hot_op.h b/lite/operators/one_hot_op.h
new file mode 100755
index 0000000000000000000000000000000000000000..4a0613952520279699a0f4a56d002483de325241
--- /dev/null
+++ b/lite/operators/one_hot_op.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class OneHotOp : public OpLite {
+ public:
+  OneHotOp() {}
+  explicit OneHotOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "one_hot"; }
+
+ private:
+  mutable OneHotParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
old mode 100644
new mode 100755
index cfee6a0391d81992069d70e9ac37e0e6594bd305..4f27e7a0d7b5bcfdbef5463d9cb352813f651bbf
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -1130,7 +1130,15 @@ struct GridSamplerParam {
   lite::Tensor* out{};
   lite::Tensor* grid{};
 };
-
-}  // namespace operators
-}  // namespace lite
-}  // namespace paddle
+/// ----------------------- one_hot operator -----------------------
+struct OneHotParam {
+  lite::Tensor* X{};
+  lite::Tensor* depth_tensor{nullptr};
+  lite::Tensor* Out{};
+  int depth{-1};
+  int dtype{};
+  bool allow_out_of_range{false};
+};
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tools/build.sh b/lite/tools/build.sh
index e1610b60d3b1b104699ab175bca3bb3cf81bd40b..6121186e7c983145f2f9f450f6a23ea1957bb496 100755
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
 
 # global variables
-BUILD_EXTRA=OFF
+BUILD_EXTRA=ON
 BUILD_JAVA=ON
 BUILD_PYTHON=OFF
 BUILD_DIR=$(pwd)
diff --git a/lite/tools/build_fpga.sh b/lite/tools/build_fpga.sh
index f8c186e92fc3ba23e5e09b6a139202d028e58fc6..ab10798fe7da34ddd88b2fab0bcc0e5f4b8ce233 100755
--- a/lite/tools/build_fpga.sh
+++ b/lite/tools/build_fpga.sh
@@ -2,12 +2,16 @@
 build_dir=build_fpga
 mkdir -p ${build_dir}
-cd ${build_dir}
 
-GEN_CODE_PATH_PREFIX=lite/gen_code
-mkdir -p ./${GEN_CODE_PATH_PREFIX}
-touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+root_dir=$(pwd)
+build_dir=${build_dir}
+
+# 1. Prepare the gen_code file inside the build directory.
+GEN_CODE_PATH_PREFIX=${build_dir}/lite/gen_code
+mkdir -p ${GEN_CODE_PATH_PREFIX}
+touch ${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
 
+cd ${build_dir}
 cmake .. \
   -DWITH_GPU=OFF \
   -DWITH_MKL=OFF \
@@ -19,8 +23,9 @@ cmake .. \
   -DLITE_WITH_OPENMP=ON \
   -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
   -DWITH_TESTING=OFF \
-  -DARM_TARGET_OS=armlinux
-
-make -j8
+  -DARM_TARGET_OS=armlinux \
+  -DLITE_BUILD_EXTRA=ON \
+  -DLITE_WITH_PROFILE=OFF
+make -j42
 cd -
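Taken together, OneHotOp::InferShape and the kernel's depth_tensor override implement a simple shape rule: Out keeps every leading dimension of X and replaces the trailing one with depth. A minimal standalone sketch (plain C++; the dims are illustrative):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Mirror of the shape rule in OneHotOp::InferShape: the trailing
    // dimension of X is replaced by `depth`, all others are kept.
    std::vector<int64_t> InferOneHotShape(std::vector<int64_t> x_dims,
                                          int depth) {
      x_dims[x_dims.size() - 1] = depth;
      return x_dims;
    }

    int main() {
      // X of shape [6, 1] with depth 4 -> Out of shape [6, 4].
      for (int64_t d : InferOneHotShape({6, 1}, 4)) {
        std::cout << d << " ";  // prints: 6 4
      }
      std::cout << "\n";
    }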