Commit c6d82e0e authored by chonwhite

merge attention_diff into fpga_pr

...@@ -104,3 +104,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
# generated files
lite/api/paddle_use_kernels.h
lite/api/paddle_use_ops.h
lite/backends/arm/math/dotprod/gemm_sdot.h
lite/tools/cmake_tools/ast.pyc
...@@ -22,6 +22,8 @@ if (WITH_PADDLE_MOBILE)
return()
endif(WITH_PADDLE_MOBILE)
# set(CMAKE_BUILD_TYPE DEBUG)
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 11)
......
./lite/tools/build.sh \
--arm_os=armlinux \
--arm_abi=armv8 \
--arm_lang=gcc \
test
...@@ -198,14 +198,24 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv3_fpga SRCS inceptionv3_test_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc
DEPS ${lite_model_test_DEPS})
# lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
# DEPS ${lite_model_test_DEPS}
......
...@@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
<< kpf_path;
}
#ifndef LITE_WITH_FPGA
lite::Tensor *Predictor::GetInput(size_t offset) {
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
...@@ -130,6 +131,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
<< " in exec_scope";
return in_var->GetMutable<lite::Tensor>();
}
#else
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = exec_scope_->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
#endif
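A minimal usage sketch of the FPGA-specific GetInput above (illustrative only, not part of the commit; it assumes a predictor already built with FPGA-valid places): because the feed list grows on demand, any input slot can be requested up front without tripping the size CHECK used in the generic path.
lite::Predictor predictor;
predictor.Build("", "attention/model", "attention/params", valid_places);
lite::Tensor* in = predictor.GetInput(3);  // feed list silently grows to 4 entries
in->Resize(DDim(std::vector<DDim::value_type>({1, 1})));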
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
...@@ -167,6 +179,8 @@ void Predictor::PrepareFeedFetch() {
}
}
#ifndef LITE_WITH_FPGA
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
CHECK(output_names_.size() > offset)
<< "The network has " << output_names_.size() << " outputs"
...@@ -186,6 +200,29 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
}
return outputs;
}
#else
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
std::vector<const lite::Tensor *> outputs;
for (auto& out : fetch_list) {  // take a reference: &out on a by-value copy would dangle
outputs.push_back(&out);
}
return outputs;
}
#endif
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
#ifdef LITE_WITH_FPGA
TEST(ResNet50, test) {
lite::Predictor predictor;
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// std::vector<Place> valid_places(
// {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build("",
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
}
#endif
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>  // std::ifstream / std::ofstream used below
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
void read_from_file(const std::string& path, float* data, int num) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
exit(-1);
return;
}
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
}
void chw_to_hwc(float* src, float* dst, int channel, int height, int width) {
int amount_per_row = width * channel;
int index = 0;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
int dst_index = offset_height + w * channel + c;
dst[dst_index] = src[index];
index = index + 1;
}
}
}
}
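A quick illustration of the index math above (a hypothetical standalone check, not taken from the test): with channel=2, height=1, width=2, CHW data {1, 2, 3, 4} lands in HWC order as {1, 3, 2, 4}.
float src[4] = {1, 2, 3, 4};  // CHW: both width positions of channel 0, then channel 1
float dst[4] = {0};
chw_to_hwc(src, dst, /*channel=*/2, /*height=*/1, /*width=*/2);
// dst == {1, 3, 2, 4}: each (h, w) cell now holds its two channel values contiguously.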
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
// predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
predictor.Build("", "attention/model", "attention/params", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 1, 100, 200})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
read_from_file(FLAGS_input_file, data, 100 * 200);
//=============================================
auto* init_ids = predictor.GetInput(1);
init_ids->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_ids = init_ids->mutable_data<float>();
auto ids_size = init_ids->dims().production();
for (int i = 0; i < ids_size; i++) {
data_ids[i] = 0;
}
auto lod_ids = init_ids->mutable_lod();
std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
*lod_ids = lod_i;
//=============================================
auto* init_scores = predictor.GetInput(2);
init_scores->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_scores = init_scores->mutable_data<float>();
auto scores_size = init_scores->dims().production();  // was input_tensor's dims, which would overflow the 1x1 scores buffer
for (int i = 0; i < scores_size; i++) {
data_scores[i] = 0;
}
auto lod_scores = init_scores->mutable_lod();
std::vector<std::vector<uint64_t>> lod_s{{0, 1}, {0, 1}};
*lod_scores = lod_s;
//=============================================
auto* position_encoding = predictor.GetInput(3);
position_encoding->Resize(
DDim(std::vector<DDim::value_type>({1, 33, 10, 23})));
auto* position_encoding_data = position_encoding->mutable_data<float>();
float* temp_data = position_encoding_data;
for (int i = 0; i < position_encoding->dims().production(); ++i) {
temp_data[i] = 0;
}
int index = 0;
for (int i = 0; i < 10; i++) {
for (int row = 0; row < 10; row++) {
for (int col = 0; col < 23; col++) {
if (i == row) {
temp_data[index] = 1.0f;
} else {
temp_data[index] = 0.0f;
}
index++;
}
}
}
for (int i = 0; i < 23; i++) {
for (int row = 0; row < 10; row++) {
for (int col = 0; col < 23; col++) {
if (i == col) {
temp_data[index] = 1.0f;
} else {
temp_data[index] = 0.0f;
}
index++;
}
}
}
// chw_to_hwc(temp_data, position_encoding_data, 33, 10, 23);
// delete[] temp_data;
// read_from_file("position_encoding.data", position_encoding_data, 33 * 10 *
// 23);
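// Note on the two fill loops above (an informal reading, not part of the
// diff): the first loop writes channels 0..9 as a one-hot of the row index,
// the second writes channels 10..32 as a one-hot of the column index, so
// every (row, col) cell of the 10x23 grid receives a 33-dim position code.
// Entry count check: 10*10*23 + 23*10*23 == 33*10*23 == 7590 == index.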
auto start = GetCurrentUS();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
std::cout << "================== Speed Report ===================";
std::cout << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::string file = "plate_data/" + FLAGS_input_file.substr(9);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
}
TEST(OcrAttention, test_arm) {
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
TestModel(valid_places, Place{TARGET(kARM), PRECISION(kFloat)});
}
} // namespace lite
} // namespace paddle
...@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
std::vector<Place> valid_places(
{Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build(FLAGS_model_dir,
"",
"",
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
valid_places);
predictor.Build(FLAGS_model_dir, "", "", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <dirent.h>
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cerrno>   // errno in GetDirectoryFiles
#include <cstring>  // std::strerror
#include <fstream>
#include <iostream>
#include <memory>   // std::shared_ptr
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
std::vector<std::string> files;
std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()),
[](DIR* dir) { dir&& closedir(dir); });
struct dirent* dirent_ptr;
if (!directory_ptr) {
std::cout << "Error opening : " << std::strerror(errno) << dir << std::endl;
return files;
}
while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
files.push_back(std::string(dirent_ptr->d_name));
}
return files;
}
void readFromFile(int num, std::string path, float* data) {
std::ifstream file_stream(path);
// file_stream.open(path);
if (!file_stream.good()) {
std::cout << "file: " << path << " dones not exist!\n";
exit(-1);
return;
}
// float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
file_stream.close();
}
// #ifdef LITE_WITH_FPGA
TEST(ResNet50, test) {
lite::Predictor predictor;
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// predictor.Build(FLAGS_model_dir, "", "", valid_places);
predictor.Build("",
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places);
auto* input_tensor = predictor.GetInput(0);
int width = 300;
int height = 300;
// std::ifstream file_stream(FLAGS_input_file);
// if (!file_stream.good()) {
// std::cout << "file: " << FLAGS_input_file << " dones not exist!\n";
// exit(-1);
// return;
// }
// file_stream >> height;
// file_stream >> width;
input_tensor->Resize(
DDim(std::vector<DDim::value_type>({1, 3, height, width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
// readFromFile(item_size, "car.data", data);
int num = 3 * width * height;
// for (int i = 0; i < num; ++i) {
// float value = 0;
// file_stream >> value;
// data[i] = value;
// }
// file_stream.close();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
auto* out = predictor.GetOutput(0);
for (int i = 0; i < out->dims().production(); i++) {
std::cout << ":" << out->data<float>()[i] << std::endl;
}
std::string file = "output/" + FLAGS_input_file.substr(6);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
LOG(INFO) << "================== Speed Report ===================";
}
// #endif
} // namespace lite
} // namespace paddle
...@@ -32,7 +32,8 @@ class Debugger {
}
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
if (op_type != "conv") { // NOLINT
if (op_config[op_type]) {
tensor->saveToFile(op_type, true);
}
}
...@@ -40,8 +41,19 @@ class Debugger {
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["pooling"] = true;
op_config["conv"] = true; op_config["conv"] = true;
op_config["crop"] = true; op_config["crop"] = true;
op_config["feed"] = true;
op_config["mul"] = true;
op_config["fetch"] = true;
op_config["boxes"] = true;
op_config["scores"] = true;
op_config["nms"] = true;
op_config["pb_boxes"] = true;
op_config["pb_variances"] = true;
// op_config["fc"] = true;
op_config["softmax"] = true;
}
};
...@@ -131,9 +143,7 @@ inline void save_tensor(const lite::Tensor* t,
chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
} // namespace lite
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"
using paddle::zynqmp::float16;
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height) {
paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
paddle::zynqmp::config_inplace(inplace_args);
paddle::zynqmp::ImageInputArgs input_args = {nullptr};
input_args.address = nullptr;
input_args.scale_address = nullptr;
float16* input_image_address =
reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
input_width * input_height * input_channel * sizeof(float16)));
int index = 0;
for (int i = 0; i < input_width * input_height * input_channel; i++) {
input_image_address[i] = float16(1.0 * input[i]);
}
paddle::zynqmp::ResizeArgs resize_args = {0};
resize_args.input_width = input_width;
resize_args.input_height = input_height;
resize_args.image_channel = input_channel;
resize_args.output_width = output_width;
resize_args.output_height = output_height;
float height_ratio = static_cast<float>(input_height) /
static_cast<float>(resize_args.output_height);
float width_ratio = static_cast<float>(input_width) /
static_cast<float>(resize_args.output_width);
resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio);
resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio);
int output_size =
resize_args.output_width * resize_args.output_height * input_channel;
float16* fpga_output = reinterpret_cast<float16*>(
paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
resize_args.input_image_address = input_image_address;
resize_args.output_image_address = fpga_output;
memset(fpga_output, 0, output_size * sizeof(float16));
paddle::zynqmp::fpga_flush(
input_image_address,
input_width * input_height * input_channel * sizeof(float16));
paddle::zynqmp::fpga_flush(resize_args.output_image_address,
output_size * sizeof(float16));
int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
if (ret == 0) {
paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
output_size * sizeof(float16));
}
for (int i = 0; i < output_size; i++) {
output[i] = fpga_output[i];
}
}
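A hedged usage sketch for fpga_resize (illustrative; the 300x300-to-224x224 sizes and buffer handling are assumptions, not taken from this commit):
#include <vector>
std::vector<float> src(300 * 300 * 3, 0.5f);  // HWC float input image
std::vector<uint8_t> dst(224 * 224 * 3);      // resized 8-bit output
fpga_resize(src.data(), 300, 300, 3, dst.data(), 224, 224);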
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
...@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
std::ofstream ofs;
ofs.open(name);
int8_t* data = static_cast<int8_t*> data_in;
int8_t* data = static_cast<int8_t*>(data_in);
for (int i = 0; i < size; i++) {
float value = data[i];
ofs << value << std::endl;
...@@ -221,6 +221,7 @@ int8_t* format_filter(float* data_in,
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
......
File mode changed from 100644 to 100755
...@@ -62,6 +62,7 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef ENABLE_DEBUG
std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
......
...@@ -62,6 +62,7 @@ class ConvPE : public PE {
param_.filter->shape().height() == 1) { // NOLINT
}
if (!use_cpu_) { // NOLINT
// param_.filter->releaseData();
}
}
...@@ -92,6 +93,7 @@ class ConvPE : public PE {
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
......
...@@ -266,8 +266,8 @@ inline void split_filter_num(const ConvParam& c_param) {
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
...@@ -364,6 +364,7 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = out_address;
...@@ -419,6 +420,7 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale,
&bias,
&conv_param->filter,
...@@ -446,6 +448,7 @@ inline void split_channel(const ConvParam& c_param) {
args.image.height = conv_param->input.shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale();
...@@ -476,6 +479,7 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
if (ret == 0 && size > 1) {
// Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
for (int i = 0; i < 1; i++) {
for (int i = 0; i < img.shape().numel(); i++) {
......
...@@ -62,6 +62,7 @@ class DepthwiseConvPE : public PE {
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
......
File mode changed from 100644 to 100755
...@@ -47,8 +47,10 @@ class GRUPE : public PE {
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
...@@ -115,11 +117,9 @@ class GRUPE : public PE {
if (hidden_prev) {
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
prev_hidden_.saveToFile("prev_.txt");
}
mul_pe_.dispatch();
reset_hidden_.saveToFile("reset_hidden_.txt");
// reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
reset_gate_data += stride_update;
...@@ -170,6 +170,7 @@ class GRUPE : public PE {
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
......
...@@ -66,7 +66,7 @@ class PoolingPE : public PE {
param_.poolingArgs = args;
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 7 || k_height > 7);
(k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
}
...@@ -76,6 +76,7 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
...@@ -188,7 +189,9 @@ class PoolingPE : public PE {
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
......
...@@ -89,7 +89,6 @@ class ScalePE : public PE {
}
}
}
float* scale_data_float = param_.scale->data<float>();
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
......
...@@ -348,9 +348,19 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
void printScale(std::string type) { printScale(); }
void printScale(std::string type) {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
std::cout << type << " \n";
printScale();
}
std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" +
...@@ -378,6 +388,7 @@ class Tensor {
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
std::cout << "======== saving file:" << npath << " ============\n";
save_file_with_name(npath);
}
......
...@@ -165,6 +165,9 @@ class TensorLite {
TargetType target() const { return target_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
...@@ -254,6 +257,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
int64_t base = numel() / dims_[0];
TensorLite dst;
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
......
File mode changed from 100644 to 100755
...@@ -101,7 +101,6 @@ void TypeTargetTransformPass::AddIoCopyInst(
auto io_copy_output_name =
string_format("%s/target_trans", in->AsArg().name.c_str());
// string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
if (copied_nodes->count(in->AsArg().name)) {
// Remove the old link
RemoveDirectedLink(in, inst_node);
......
File mode changed from 100644 to 100755
...@@ -138,11 +138,16 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
std::string op_type = inst.op()->op_info()->Type();
#ifndef LITE_WITH_FPGA
if (op_type == "feed" || op_type == "fetch") continue; if (op_type == "feed" || op_type == "fetch") continue;
#endif
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
LITE_PRECISION_PROFILE(inst)
#ifndef LITE_WITH_FPGA
// LITE_PRECISION_PROFILE(inst)
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......
File mode changed from 100644 to 100755
...@@ -46,7 +46,7 @@ class Tensor {
*/
class PaddlePredictor {
public:
void Init();
void Init() {}
std::unique_ptr<Tensor> GetTensor(const std::string &id) const;
std::unique_ptr<Tensor> GetMutableTensor(const std::string &id);
......
...@@ -62,6 +62,10 @@ void CastCompute::Run() {
int32_t* out_data = param.Out->mutable_data<int32_t>();
std::transform(
x_data_begin, x_data_end, out_data, TransOp<int64_t, int32_t>);
} else if (param.in_dtype == 3 && param.out_dtype == 5) {
const auto* x_data = param.X->data<float>();
auto* o_data = param.Out->mutable_data<float>();
memcpy(o_data, x_data, sizeof(float) * param.X->numel());
} else {
LOG(FATAL) << "other has not been implemented";
}
......
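For reference, the dtype codes compared in the new branch follow Paddle's framework::proto::VarType numbering (an assumption based on upstream Paddle, not stated in this diff), so in_dtype == 3 with out_dtype == 5 is an INT64-to-FP32 cast; note the branch memcpy's the buffer as float, which presumes the tensor already holds float data on the FPGA path.
enum FluidVarType {  // relevant subset of VarType::Type (assumed values)
  INT16 = 1,
  INT32 = 2,
  INT64 = 3,
  FP16 = 4,
  FP32 = 5,
};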
...@@ -60,26 +60,11 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.Out->template mutable_data<int32_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.Out->template mutable_data<int8_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
}
// auto data = param.Out->template mutable_data<T>();
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
}
virtual ~FillConstantCompute() = default;
...@@ -94,32 +79,38 @@ class FillConstantBatchLikeCompute
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.input->lod().size() && param.input_dim_idx == 0) {
auto odims = param.out->dims();
odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
param.out->Resize(odims);
}
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.out->template mutable_data<int32_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.out->template mutable_data<int8_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
}
// auto data = param.out->template mutable_data<T>();
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
// if (param.input->lod().size() && param.input_dim_idx == 0) {
// auto odims = param.out->dims();
// odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
// param.out->Resize(odims);
// }
// if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
// auto data = param.out->template mutable_data<float>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT32)) {
// auto data = param.out->template mutable_data<int32_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT8)) {
// auto data = param.out->template mutable_data<int8_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else {
// LOG(FATAL) << "not supported dtype " << param.dtype;
// }
}
virtual ~FillConstantBatchLikeCompute() = default;
...@@ -144,6 +135,7 @@ REGISTER_LITE_KERNEL(fill_constant,
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
kARM,
kAny,
...@@ -153,3 +145,4 @@ REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
...@@ -59,6 +59,8 @@ namespace arm {
template <>
void NCHWToNHWCCompute<PRECISION(kFloat)>::Run() {
NCHWTONHWC(float);
// auto& param = this->template Param<param_t>();
// param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
template <>
...@@ -69,6 +71,9 @@ void NCHWToNHWCCompute<PRECISION(kInt8)>::Run() {
template <>
void NHWCToNCHWCompute<PRECISION(kFloat)>::Run() {
NHWCTONCHW(float);
// auto& param = this->template Param<param_t>();
// param.y->mutable_data<float>();
// param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
template <>
......
...@@ -28,6 +28,7 @@ namespace arm {
void LookupTableCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs
auto w = param.W;
auto ids = param.Ids;
...@@ -36,7 +37,7 @@ void LookupTableCompute::Run() {
auto table_dim = w->dims();
int64_t ids_numel = ids->numel();
auto ids_data = ids->data<int64_t>();
auto ids_data = ids->data<float>();
int64_t row_number = table_dim[0];
int64_t row_width = table_dim[1];
...@@ -75,14 +76,3 @@ REGISTER_LITE_KERNEL(lookup_table,
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
...@@ -7,7 +7,9 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
...@@ -16,9 +18,11 @@ add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
......
File mode changed from 100644 to 100755
...@@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(feed,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FeedCompute,
def_host)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
...@@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() {
}
void FetchCompute::Run() {
pe_.dispatch();
auto& param = this->Param<param_t>();
auto fetch_list = param.fetch_list;
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
Tensor& out = param.fetch_list->at(param.col);
out.Resize(param.input->dims());
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
...@@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(fetch,
...@@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
kNHWC,
paddle::lite::kernels::fpga::FetchCompute,
host_host)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <iostream>
#include <string>
#include <vector>
...@@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() {
void GRUCompute::Run() {
auto& param = this->Param<param_t>();
param.hidden->mutable_data<float>();
// inputs
auto input = param.input;
auto h0 = param.h0;
...@@ -130,6 +132,7 @@ void GRUCompute::Run() {
// //3.
gru_value.prev_out_value = ordered_h0.mutable_data<float>();
gru_tensors.pre_output = ordered_h0.ZynqTensor();
} else {
gru_value.prev_out_value = nullptr;
gru_tensors.pre_output = nullptr;
...@@ -169,6 +172,7 @@ void GRUCompute::Run() {
float* hidden_data =
hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
gru_tensors.gate = &float_input;
gru_tensors.output = &hidden_out;
...@@ -187,11 +191,6 @@ void GRUCompute::Run() {
*(batch_hidden->mutable_lod()) = batch_gate->lod();
batch_hidden->mutable_data<float>();
to_seq(*batch_hidden, hidden);
save_tensor(const_cast<Tensor*>(input), "_input.txt");
save_tensor(hidden, "_gru.txt");
exit(-1);
}
} // namespace fpga
......
File mode changed from 100644 to 100755
...@@ -119,7 +119,81 @@ class IoCopyFpgaToHostCompute
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
void hwc_to_chw(float* chw_data,
float* hwc_data,
int num,
int channel,
int height,
int width) {
int chw = channel * height * width;
int wc = width * channel;
int wh = width * height;
int index = 0;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
chw_data[n * chw + c * wh + h * width + w] = hwc_data[index];
index++;
}
}
}
}
}
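A round-trip sanity sketch (hypothetical, mirroring the chw_to_hwc helper in the OCR test earlier in this commit): feeding the HWC buffer from that example back through hwc_to_chw restores channel-major order.
float hwc[4] = {1, 3, 2, 4};  // HWC buffer (output of the chw_to_hwc example)
float chw[4] = {0};
hwc_to_chw(chw, hwc, /*num=*/1, /*channel=*/2, /*height=*/1, /*width=*/2);
// chw == {1, 2, 3, 4}: the channel 0 plane first, then channel 1.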
class IoCopyFpgaToHostCHWCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
Tensor hwc;
hwc.Resize(param.y->dims());
float* hwc_data = hwc.mutable_data<float>();
float* chw_data = param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true);
tempTensor.unalignImage();
hwc.ZynqTensor()->copyFrom(&tempTensor);
} else {
hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
int num = 1;
int channel = 1;
int height = 1;
int width = 1;
auto dims = param.y->ZynqTensor()->shape();
hwc_to_chw(chw_data,
hwc_data,
dims.num(),
dims.channel(),
dims.height(),
dims.width());
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
// param.x->ZynqTensor()->saveToFile("io_x", true);
// param.y->ZynqTensor()->saveToFile("io_y", true);
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
...@@ -170,7 +244,7 @@ REGISTER_LITE_KERNEL(io_copy,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM),
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
...@@ -179,8 +253,8 @@ REGISTER_LITE_KERNEL(io_copy,
kFPGA,
kAny,
kAny,
paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
device_to_host_22)
paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute,
device_to_host_chw)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
...@@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() {
scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
outs->ZynqTensor()->copyFrom(out.ZynqTensor());
}
outs->Resize({static_cast<int64_t>(e - s), out_dim});
}
}
LoD lod;
......
...@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() {
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
priobox_param.image = param.image->ZynqTensor();
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(one_hot_compute_host Host basic SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any) #lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any) #lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
...@@ -392,7 +392,13 @@ REGISTER_LITE_KERNEL(multiclass_nms,
                     kNCHW,
                     paddle::lite::kernels::host::MulticlassNmsCompute,
                     def)
-    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("BBoxes",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindInput("Scores",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include <utility>
#include <vector>
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/host/one_hot_compute.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
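
// Expands each integer index in X into a one-hot float row of length
// `depth` along the last dimension of Out.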
void OneHotCompute::Run() {
  auto& param = Param<operators::OneHotParam>();

  // The `depth` attribute may be overridden at runtime by the optional
  // depth_tensor input; re-infer the output shape when that happens.
  int depth = param.depth;
  if (param.depth_tensor) {
    depth = param.depth_tensor->data<int32_t>()[0];
    auto in_dims = param.X->dims();
    DDim out_dims(in_dims);
    out_dims[out_dims.size() - 1] = depth;
    param.Out->Resize(out_dims);
  }
  auto* p_in_data = param.X->data<float>();
  auto numel = param.X->numel();

  // Allocate the output after any resize above, then zero-fill it.
  auto* p_out_data = param.Out->mutable_data<float>();
  for (int64_t i = 0; i < param.Out->numel(); ++i) {
    p_out_data[i] = 0;
  }

  if (param.allow_out_of_range) {
    // Out-of-range indices are silently skipped.
    for (int64_t i = 0; i < numel; ++i) {
      if (p_in_data[i] >= 0 && p_in_data[i] < depth) {
        p_out_data[i * depth + static_cast<int>(p_in_data[i])] = 1.0f;
      }
    }
  } else {
    for (int64_t i = 0; i < numel; ++i) {
      PADDLE_ENFORCE_GE(
          p_in_data[i], 0, "Illegal index value, should be at least 0.");
      PADDLE_ENFORCE_LT(p_in_data[i],
                        depth,
                        "Illegal index value, should be less than depth (%d).",
                        depth);
      p_out_data[i * depth + static_cast<int>(p_in_data[i])] = 1.0f;
    }
  }
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(one_hot,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::OneHotCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
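
// Illustrative example (not part of the commit): with X = {1.0, 3.0} and
// depth = 4, Out holds two one-hot rows:
//   row 0: {0, 1, 0, 0}
//   row 1: {0, 0, 0, 1}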
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
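
// Host-side one_hot kernel declaration; the encoding logic lives in
// one_hot_compute.cc.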
class OneHotCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override;
virtual ~OneHotCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -46,19 +46,43 @@ REGISTER_LITE_KERNEL(reshape,
                     paddle::lite::kernels::host::ReshapeCompute,
                     def)
     .BindInput("X",
-               {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindInput("ShapeTensor",
-               {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindInput("Shape",
-               {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindOutput("Out",
-               {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .Finalize();

+// REGISTER_LITE_KERNEL(reshape,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::host::ReshapeCompute,
+//                      def)
+//     .BindInput("X",
+//                {LiteType::GetTensorTy(
+//                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
+//     .BindInput("ShapeTensor",
+//                {LiteType::GetTensorTy(
+//                    TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
+//     .BindInput("Shape",
+//                {LiteType::GetTensorTy(
+//                    TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
+//     .BindOutput("Out",
+//                 {LiteType::GetTensorTy(
+//                     TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
+//     .Finalize();
+
 REGISTER_LITE_KERNEL(reshape2,
                      kHost,
                      kAny,
......
...@@ -136,6 +136,8 @@ add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS})
 add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS})
 add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS})
+add_operator(one_hot basic SRCS one_hot_op.cc DEPS ${op_DEPS})

 if (NOT LITE_WITH_X86)
     lite_cc_test(test_fc_op SRCS fc_op_test.cc
             DEPS fc_op memory
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/one_hot_op.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
namespace operators {
bool OneHotOp::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Out);
return true;
}
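
// Shape inference relies on the `depth` attribute; when depth instead comes
// from a runtime depth_tensor, the host kernel re-infers the shape in Run().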
bool OneHotOp::InferShape() const {
CHECK_OR_FALSE(param_.Out);
// TODO(Superjomn) Enable data sharing.
auto out_dims = param_.X->dims();
out_dims[out_dims.size() - 1] = param_.depth;
param_.Out->Resize(out_dims);
return true;
}
bool OneHotOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
param_.X =
scope->FindVar(opdesc.Input("X").front())->GetMutable<lite::Tensor>();
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
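  // depth_tensor is an optional input that, when present, overrides the
  // `depth` attribute at runtime.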
if (opdesc.HasInput("depth_tensor")) {
auto depth_tensor = opdesc.Input("depth_tensor").front();
param_.depth_tensor =
scope->FindVar(depth_tensor)->GetMutable<lite::Tensor>();
}
CHECK(param_.X);
CHECK(param_.Out);
param_.depth = opdesc.GetAttr<int>("depth");
param_.dtype = opdesc.GetAttr<int>("dtype");
if (opdesc.HasAttr("allow_out_of_range")) {
param_.allow_out_of_range = opdesc.GetAttr<bool>("allow_out_of_range");
}
auto out_lod = param_.Out->mutable_lod();
*out_lod = param_.X->lod();
// param_.allow_out_of_range = opdesc.GetAttr<bool>("allow_out_of_range");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(one_hot, paddle::lite::operators::OneHotOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
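
// Operator wrapper for one_hot: validates inputs, infers the output shape,
// and binds OneHotParam from the op description.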
class OneHotOp : public OpLite {
public:
OneHotOp() {}
explicit OneHotOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "one_hot"; }
private:
mutable OneHotParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -1130,7 +1130,15 @@ struct GridSamplerParam {
   lite::Tensor* out{};
   lite::Tensor* grid{};
 };
+
+/// --------------------- attentions operators --------------
+struct OneHotParam {
+  lite::Tensor* X{};
+  lite::Tensor* depth_tensor{nullptr};
+  lite::Tensor* Out{};
+  int depth{-1};
+  int dtype{};
+  bool allow_out_of_range{false};
+};
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
...@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}

 # global variables
-BUILD_EXTRA=OFF
+BUILD_EXTRA=ON
 BUILD_JAVA=ON
 BUILD_PYTHON=OFF
 BUILD_DIR=$(pwd)
......
...@@ -2,12 +2,16 @@

 build_dir=build_fpga
 mkdir -p ${build_dir}
-cd ${build_dir}
-GEN_CODE_PATH_PREFIX=lite/gen_code
-mkdir -p ./${GEN_CODE_PATH_PREFIX}
-touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+root_dir=$(pwd)
+build_dir=${build_dir}
+
+# in build directory
+# 1. Prepare gen_code file
+GEN_CODE_PATH_PREFIX=${build_dir}/lite/gen_code
+mkdir -p ${GEN_CODE_PATH_PREFIX}
+touch ${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+
+cd ${build_dir}

 cmake .. \
     -DWITH_GPU=OFF \
     -DWITH_MKL=OFF \
...@@ -19,8 +23,9 @@ cmake .. \
     -DLITE_WITH_OPENMP=ON \
     -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
     -DWITH_TESTING=OFF \
-    -DARM_TARGET_OS=armlinux
+    -DARM_TARGET_OS=armlinux \
+    -DLITE_BUILD_EXTRA=ON \
+    -DLITE_WITH_PROFILE=OFF

-make -j8
+make -j42
 cd -