PaddlePaddle / Paddle-Lite — commit 945aa36f
Authored Dec 09, 2019 by chonwhite
Parent: c6f500fd

removed some comments

Showing 33 changed files with 107 additions and 913 deletions (+107 −913)
lite/api/paddle_use_kernels.h                       +1    -0
lite/api/test_ssd_fpga.cc                           +10   -10
lite/backends/fpga/KD/debugger.hpp                  +3    -4
lite/backends/fpga/KD/llapi/bias_scale.cpp          +0    -2
lite/backends/fpga/KD/llapi/zynqmp_api.cpp          +4    -6
lite/backends/fpga/KD/pes/conv_pe.hpp               +0    -3
lite/backends/fpga/KD/pes/conv_process.hpp          +1    -1
lite/backends/fpga/KD/pes/fully_connected_pe.hpp    +0    -14
lite/backends/fpga/KD/tensor.hpp                    +1    -1
lite/backends/fpga/lite_tensor.cc                   +0    -17
lite/backends/fpga/lite_tensor.h                    +0    -16
lite/core/optimizer.h                               +2    -1
lite/kernels/arm/prior_box_compute.cc               +12   -0
lite/kernels/fpga/beam_search_decode_compute.cc     +0    -296
lite/kernels/fpga/beam_search_decode_compute.h      +0    -39
lite/kernels/fpga/box_coder_compute.cc              +0    -60
lite/kernels/fpga/box_coder_compute.h               +0    -37
lite/kernels/fpga/calib_compute.cc                  +0    -13
lite/kernels/fpga/concat_compute.cc                 +6    -2
lite/kernels/fpga/conv_compute.cc                   +8    -13
lite/kernels/fpga/dropout_compute.cc                +6    -7
lite/kernels/fpga/elementwise_compute.cc            +12   -24
lite/kernels/fpga/fc_compute.cc                     +4    -2
lite/kernels/fpga/feed_compute.cc                   +5    -11
lite/kernels/fpga/fetch_compute.cc                  +9    -5
lite/kernels/fpga/mul_compute.cc                    +8    -43
lite/kernels/fpga/multiclass_nms_compute.cc         +4    -16
lite/kernels/fpga/norm_compute.cc                   +5    -17
lite/kernels/fpga/pooling_compute.cc                +6    -3
lite/kernels/fpga/while_compute.cc                  +0    -58
lite/kernels/fpga/while_compute.h                   +0    -84
lite/kernels/fpga/write_to_array_compute.cc         +0    -65
lite/kernels/fpga/write_to_array_compute.h          +0    -43
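Most of the kernel edits below follow one pattern: ad-hoc saveToFile/std::cout debugging inside each FPGA kernel's Run() is replaced by a single registration call into the Debugger helper, compiled in only when FPGA_PRINT_TENSOR is defined. A minimal sketch of that pattern for a generic kernel — the kernel name SomeCompute, the op name "some_op", the param type zynqmp::SomeParam, and the pe_ member are illustrative placeholders, not identifiers from this commit:

// Sketch only: debug output is routed through the Debugger registry instead
// of being written to files inline. Names below are placeholders.
void SomeCompute::Run() {
  pe_.dispatch();  // run the pre-configured processing element as before
#ifdef FPGA_PRINT_TENSOR
  zynqmp::SomeParam& out_param = pe_.param();
  // The Debugger decides centrally whether/how to dump the registered tensor.
  Debugger::get_instance().registerOutput("some_op", out_param.output);
#endif
}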
lite/api/paddle_use_kernels.h
@@ -15,6 +15,7 @@ USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(prior_box_fpga, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def);
lite/api/test_ssd_fpga.cc
@@ -135,16 +135,16 @@ TEST(ResNet50, test) {
   //   std::cout << ":" << out1->data<float>()[i] << std::endl;
   // }

-  // std::string file = "output/" + FLAGS_input_file.substr(6);
-  // std::cout << "file:::" << file << std::endl;
-  // std::ofstream ofs;
-  // ofs.open(file);
-  // for (int i = 0; i < out->dims().production(); i++) {
-  //   float value = out->data<float>()[i];
-  //   ofs << value << std::endl;
-  // }
-  // ofs.close();
+  std::string file = "output/" + FLAGS_input_file.substr(6);
+  std::cout << "file:::" << file << std::endl;
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < out->dims().production(); i++) {
+    float value = out->data<float>()[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();

   LOG(INFO) << "================== Speed Report ===================";
 }
lite/backends/fpga/KD/debugger.hpp
@@ -5,6 +5,8 @@
 namespace paddle {
 namespace lite {

+#define FPGA_PRINT_TENSOR
+
 class Debugger {
  public:
   static Debugger& get_instance() {
@@ -12,7 +14,7 @@ class Debugger {
     return s_instance;
   }

-  void registerOutput(std::string op_type, Tensor* tensor) {
+  void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
     // tensor->printScale();
     // tensor->saveToFile(op_type, true);
   }
@@ -101,8 +103,6 @@ inline void save_float(float* data, const std::string& name, int len) {
 }

 inline void save_tensor(lite::Tensor* t, const std::string& name, bool convert = true) {
   float* data = const_cast<float*>(t->data<float>());
   float* dst = new float[t->numel()];
   if (convert) {
@@ -111,7 +111,6 @@ inline void save_tensor(lite::Tensor* t, const std::string& name, bool convert = true) {
   }
   save_float(data, name, t->numel());
   delete[] dst;
 }
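Only fragments of the Debugger class survive in these hunks. For orientation, a compact, self-contained sketch of the same function-local-static singleton shape — everything other than the Debugger / get_instance / registerOutput names (in particular the local `static Debugger s_instance;`) is an assumption, and the real class carries more state than shown:

// Hedged sketch of the singleton used by debugger.hpp; note that in this
// commit registerOutput is effectively a no-op with the dump calls commented.
class Debugger {
 public:
  static Debugger& get_instance() {
    static Debugger s_instance;  // assumed: constructed once, on first use
    return s_instance;
  }
  void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
    // tensor->printScale();
    // tensor->saveToFile(op_type, true);
  }
};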
lite/backends/fpga/KD/llapi/bias_scale.cpp
@@ -86,10 +86,8 @@ void format_bias_array(float **bias_array, int num) {
       (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t));  // NOLINT
   memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
-  std::cout << "bias::" << std::endl;
   for (int i = 0; i < num_before_align; i++) {
-    float value = ptr_aligned[i];
-    std::cout << "@:" << i << " = " << value << std::endl;
     ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
   }
   *bias_array = (float *)ptr_aligned;  // NOLINT
lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -28,7 +28,7 @@ limitations under the License. */
 namespace paddle {
 namespace zynqmp {

-#define PADDLE_MOBILE_OS_LINUX
+#define PADDLE_OS_LINUX

 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
 static size_t memory_size = 0;

 static inline int do_ioctl(uint64_t req, const void *arg) {
-#ifdef PADDLE_MOBILE_OS_LINUX
+#ifdef PADDLE_OS_LINUX
   return ioctl(fd, req, arg);
 #else
   return -1;
@@ -46,11 +46,9 @@ static inline int do_ioctl(uint64_t req, const void *arg) {
 }

 int open_device() {
-  // std::cout << "open_device" << std::endl;
   if (fd == -1) {
     fd = open(device_path, O_RDWR);
   }
-  // std::cout << "open_device fd:" << fd << std::endl;
   return fd;
 }
@@ -68,7 +66,7 @@ void *fpga_malloc(size_t size) {
 #ifdef ENABLE_DEBUG
   // std::cout << "fpga_malloc:" << size << std::endl;
 #endif
-#ifdef PADDLE_MOBILE_OS_LINUX
+#ifdef PADDLE_OS_LINUX
   void *ptr = reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
   if (ptr == NULL) {
@@ -113,7 +111,7 @@ void fpga_free(void *ptr) {
   memory_size -= size;
-#ifdef PADDLE_MOBILE_OS_LINUX
+#ifdef PADDLE_OS_LINUX
   munmap(ptr, size);
 #else
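For context, a minimal usage sketch of the low-level API touched by this rename — call order only; the buffer size and error handling are illustrative, not taken from the diff:

// Illustrative call order for the zynqmp low-level driver API shown above.
if (open_device() < 0) {
  // stub builds without PADDLE_OS_LINUX leave fd at -1, so this branch fires
}
void *buf = fpga_malloc(4096);  // mmap-backed allocation on Linux builds
// ... use buf ...
fpga_free(buf);                 // munmap plus bookkeeping of memory_size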
lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -64,14 +64,11 @@ class ConvPE : public PE {
     if (!use_cpu_) {
       // param_.filter->releaseData();
     }
     // exit(-1);
   }

   void cpu_conv_hwc() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
     input->syncToCPU();
lite/backends/fpga/KD/pes/conv_process.hpp
@@ -324,7 +324,7 @@ inline void split_filter_num(const ConvParam& c_param) {
   Shape s_shape(N, {filter_num});
   float* scale_data = scale.mutableData<float>(FP32, s_shape);
   float* bias_data = bias.mutableData<float>(FP32, s_shape);
-  std::cout << "v size: " << v.size() << std::endl;
+  // std::cout << "v size: " << v.size() << std::endl;
   for (int n = 0; n < filter_num; n++) {
     scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
     // scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -94,21 +94,7 @@ class FullyConnectedPE : public PE {
   }

   bool dispatch() {
-    // return
     return convPE_.dispatch();
-    // convPE_.dispatch();
-    // if (num_ == 1) {
-    //   return true;
-    // }
-    // Tensor* output = param_.output;
-    // int size = output->shape().numel() * sizeof(floa16);
-    // memcpy(output->data<void>(), tempOut_->data<void>(), size);
-    // for (int i = 1;i < num_;i ++) {
-    //   memcpy(output->data<void>(), tempOut_->data<void>(), size);
-    // }
-    // return true;
   }

   FullyConnectedParam& param() { return param_; }
lite/backends/fpga/KD/tensor.hpp
@@ -395,7 +395,7 @@ class Tensor {
   }

   void save_file_with_name(std::string path) {
-    // return;
+    return;
     invalidate();
     // usleep(20000);
     // return;
lite/backends/fpga/lite_tensor.cc
@@ -92,34 +92,17 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
 }

 void TensorLite::CopyDataFrom(const TensorLite &other) {
-  // std::cout << "1\n";
   dims_ = other.dims_;
-  // std::cout << "2\n";
   target_ = other.target_;
-  // std::cout << "3\n";
   lod_ = other.lod_;
   auto dt = zynq_tensor_->dataType();
-  // std::cout << "4\n";
-  // std::cout << "dt:" << dt << std::endl;
   auto shape = other.zynq_tensor_->shape();
   Resize(other.dims());
-  // mutable_data<float>();
   zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
-  // std::cout << "copy Data From: \n";
-  // std::cout << "ss" << (void*)(other.ZynqTensor()) << "\n";
   this->ZynqTensor()->copyFrom(other.ZynqTensor());
-  // set_lod(other.lod());
 }

 // template <typename T>
 // void TensorLite::mutable_data_internal() {
 // }

 }  // namespace lite
 }  // namespace paddle
lite/backends/fpga/lite_tensor.h
@@ -293,23 +293,7 @@ void TensorLite::Slice(TensorLite& dst, int64_t begin, int64_t end) const {
   int64_t base = numel() / dims_[0];

   T* src_data = const_cast<T*>(data<T>());
-  std::cout << "end:" << end << " begin:" << begin << std::endl;
-  std::cout << "base:" << base << std::endl;
-  std::cout << "production:" << dst_dims.production() << std::endl;
   memcpy(dst_data,
          src_data + static_cast<size_t>(begin * dst_dims.production()),
          dst_dims.production() * sizeof(T));
-  // dst.ZynqTensor()->saveToFile("_slice", true);
-  // if (dims_[0] == 1) {
-  //   dst-
-  //   return;
-  // } else {
-  //   // dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
-  //   return dst;
-  // }
 }

 template <typename TensorT>
lite/core/optimizer.h  (file mode 100644 → 100755)
@@ -109,7 +109,8 @@ class Optimizer {
           "runtime_context_assign_pass",
           "argument_type_display_pass",
-          "memory_optimize_pass"}};
+          // "memory_optimize_pass"
+      }};
       RunPasses(passes_local);
     } else {
       RunPasses(passes);
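Commenting "memory_optimize_pass" out of passes_local means the memory optimization pass is simply never scheduled in this configuration; the surrounding logic is unchanged. Roughly, the tail of that list now reads as below (the leading entries are omitted and assumed unchanged from the context shown in the hunk):

// Sketch of the tail of the optimizer's local pass list after this commit.
std::vector<std::string> passes_local{{
    "runtime_context_assign_pass",
    "argument_type_display_pass",
    // "memory_optimize_pass"  // disabled here
}};
RunPasses(passes_local);  // passes run in the order listed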
lite/kernels/arm/prior_box_compute.cc
@@ -98,6 +98,18 @@ REGISTER_LITE_KERNEL(prior_box,
                      kNCHW,
                      paddle::lite::kernels::arm::PriorBoxCompute,
                      def)
     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(prior_box_fpga,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::PriorBoxCompute,
+                     def)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(
+                   TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
+    .BindInput("Image",
+               {LiteType::GetTensorTy(
...
lite/kernels/fpga/beam_search_decode_compute.cc  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/beam_search_decode_compute.h"
#include <algorithm>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

using LoDTensor = lite::Tensor;
using LoDTensorArray = std::vector<lite::Tensor>;

// all the lod have 2 levels.
// The first is source level, the second is sentence level.
// source level describe how many prefixes (branchs) for each source sentece
// (beam). sentence level describe how these candidates belong to the prefixes.
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;

template <typename T>
struct Sentence {
  std::vector<float> word_ids;
  std::vector<T> scores;
};

template <typename T>
using SentenceVector = std::vector<Sentence<T>>;

template <typename T>
struct BeamSearchDecoder {
  BeamSearchDecoder(size_t beam_size, int end_id)
      : beam_size_(beam_size), end_id_(end_id) {}

  /**
   * convert the result sentence_vector for each source sentence into two
   * LodTensor.
   * One is all candidate sentences with word id, one is all candidate
   * sentences with word score.
   * Param:
   *  sentence_vector_list: sentence_vector for each source sentence.
   *  id_tensor: result LoDTensor for sentences of id.
   *  score_tensor: result LoDTensor for sentences of score.
   *  reverse: whether ids of sentence in sentence_vector_list is reversed
   *  sort_by_score: whether to sort hypotheses of each sentence by scores.
   */
  void ConvertSentenceVectorToLodTensor(
      std::vector<SentenceVector<T>> sentence_vector_list,
      LoDTensor* id_tensor,
      LoDTensor* score_tensor,
      bool reverse = true,
      bool sort_by_score = true) const {
    size_t src_num = sentence_vector_list.size();
    CHECK_GT(src_num, 0) << "src_num should not be 0";

    std::vector<uint64_t> source_level_lod = {0};
    std::vector<uint64_t> sentence_level_lod = {0};
    std::vector<float> id_data;
    std::vector<T> score_data;

    for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
      if (sort_by_score) {
        sort(sentence_vector_list[src_idx].begin(),
             sentence_vector_list[src_idx].end(),
             [reverse](const Sentence<T>& a, const Sentence<T>& b) {
               if (reverse)
                 return a.scores.front() > b.scores.front();
               else
                 return a.scores.back() > b.scores.back();
             });
      }
      for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
        if (reverse) {
          id_data.insert(id_data.end(),
                         sentence.word_ids.rbegin(),
                         sentence.word_ids.rend());
          score_data.insert(score_data.end(),
                            sentence.scores.rbegin(),
                            sentence.scores.rend());
        } else {
          id_data.insert(
              id_data.end(), sentence.word_ids.begin(), sentence.word_ids.end());
          score_data.insert(
              score_data.end(), sentence.scores.begin(), sentence.scores.end());
        }
        sentence_level_lod.push_back(sentence_level_lod.back() +
                                     sentence.word_ids.size());
      }
      source_level_lod.push_back(source_level_lod.back() +
                                 sentence_vector_list[src_idx].size());
    }

    LoD lod;
    lod.push_back(source_level_lod);
    lod.push_back(sentence_level_lod);

    *(id_tensor->mutable_lod()) = lod;
    id_tensor->Resize({static_cast<int64_t>(id_data.size())});
    auto id_ptr = id_tensor->mutable_data<float>();
    TargetCopy(
        TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float));

    *(score_tensor->mutable_lod()) = lod;
    score_tensor->Resize({static_cast<int64_t>(score_data.size())});
    auto score_ptr = score_tensor->mutable_data<T>();
    TargetCopy(TARGET(kARM),
               score_ptr,
               score_data.data(),
               score_data.size() * sizeof(T));
  }

  /**
   * Gather the hypotheses for each source sentence by backtrace though the
   * LoDTensorArray step_ids whose lods reserve the path in the tree.
   */
  void Backtrace(const LoDTensorArray& step_ids,
                 const LoDTensorArray& step_scores,
                 LoDTensor* id_tensor,
                 LoDTensor* score_tensor) const {
    CHECK(!step_ids.empty()) << "step num should be larger than 0";
    CHECK_EQ(step_ids.size(), step_scores.size())
        << "step_ids and step_scores should be the same";
    const size_t step_num = step_ids.size();
    const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
    std::vector<SentenceVector<T>> sentence_vector_list(
        src_num, SentenceVector<T>(beam_size_));
    std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);

    for (int step_id = step_num - 1; step_id >= 0; --step_id) {
      auto& cur_ids = step_ids.at(step_id);
      auto& cur_scores = step_scores.at(step_id);
      for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
        // for each source sentence
        auto& sentence_vector = sentence_vector_list.at(src_idx);
        auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
        size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
        size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
        if (prefix_idx_vector.empty()) {  // be finished and pruned at this step
                                          // or the last time step
          for (size_t prefix_idx = src_prefix_start;
               prefix_idx < src_prefix_end;
               ++prefix_idx) {
            size_t candidate_start =
                cur_ids.lod().at(kSentenceLevel)[prefix_idx];
            size_t candidate_end =
                cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
            for (size_t candidate_idx = candidate_start;
                 candidate_idx < candidate_end;
                 ++candidate_idx) {
              prefix_idx_vector.push_back(prefix_idx);
              size_t idx = prefix_idx_vector.size() - 1;
              auto cur_id = cur_ids.data<float>()[candidate_idx];
              auto cur_score = cur_scores.data<T>()[candidate_idx];
              sentence_vector.at(idx).word_ids.push_back(cur_id);
              sentence_vector.at(idx).scores.push_back(cur_score);
            }
          }
        } else {  // use prefix_idx_vector to backtrace
          size_t src_candidate_start =
              cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
          size_t prefix_idx = src_prefix_start;
          size_t candidate_num =
              cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
              cur_ids.lod().at(kSentenceLevel)[prefix_idx];
          for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
            auto candidate_idx = prefix_idx_vector.at(idx);
            auto cur_id = cur_ids.data<float>()[candidate_idx];
            auto cur_score = cur_scores.data<T>()[candidate_idx];
            if (cur_id != end_id_ ||
                sentence_vector.at(idx).word_ids.empty()) {
              // to skip redundant end tokens
              sentence_vector.at(idx).word_ids.push_back(cur_id);
              sentence_vector.at(idx).scores.push_back(cur_score);
            }
            while (src_candidate_start + candidate_num <= candidate_idx) {
              // search the corresponding prefix
              prefix_idx++;
              candidate_num +=
                  cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
                  cur_ids.lod().at(kSentenceLevel)[prefix_idx];
            }
            prefix_idx_vector.at(idx) = prefix_idx;
          }
        }
      }
    }

    ConvertSentenceVectorToLodTensor(
        sentence_vector_list, id_tensor, score_tensor, true, true);
  }

  size_t beam_size_;
  int end_id_;
};

struct BeamSearchDecodeFunctor {
  BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
                          const LoDTensorArray& step_scores,
                          LoDTensor* id_tensor,
                          LoDTensor* score_tensor,
                          size_t beam_size,
                          int end_id)
      : beam_size_(beam_size),
        end_id_(end_id),
        step_ids_(step_ids),
        step_scores_(step_scores),
        id_tensor_(id_tensor),
        score_tensor_(score_tensor) {}

  template <typename T>
  void apply() const {
    BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
    beam_search_decoder.Backtrace(
        step_ids_, step_scores_, id_tensor_, score_tensor_);
  }

  size_t beam_size_;
  int end_id_;
  const LoDTensorArray& step_ids_;
  const LoDTensorArray& step_scores_;
  LoDTensor* id_tensor_;
  LoDTensor* score_tensor_;
};

template <>
void BeamSearchDecodeFunctor::apply<bool>() const {
  LOG(FATAL) << "beam search decode op does not support bool!";
}

void BeamSearchDecodeCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->template As<ARMContext>();

  // inputs
  auto ids = param.ids;
  auto scores = param.scores;
  // outputs
  auto sentence_ids = param.sentence_ids;
  auto sentence_scores = param.sentence_scores;

  const size_t step_num = ids->size();
  CHECK_GT(step_num, 0UL) << "beam search steps should be larger than 0";
  const size_t source_num = ids->at(0).lod().at(0).size() - 1;
  CHECK_GT(source_num, 0UL) << "source num should be larger than 0";

  for (size_t i = 0; i < step_num; ++i) {
    CHECK_EQ(ids->at(i).lod().size(), 2UL) << "Level of LodTensor should be 2";
  }

  //! fixme
  // only support float score now
  BeamSearchDecodeFunctor func(*ids,
                               *scores,
                               sentence_ids,
                               sentence_scores,
                               param.beam_size,
                               param.end_id);

  func.apply<float>();
}

}  // namespace arm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(beam_search_decode,
                     kARM,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::arm::BeamSearchDecodeCompute,
                     def)
    .BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))})
    .BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))})
    .BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
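The comment near the top of the deleted file describes the two-level LoD the decoder walks: level 0 (kSourceLevel) segments candidates by source sentence, level 1 (kSentenceLevel) segments the flattened word ids by candidate. A small worked example of that layout, with purely illustrative values:

// Illustrative only: 2 source sentences; the first kept 2 candidate prefixes,
// the second kept 1. Candidate k's word ids live in [lod[1][k], lod[1][k+1]).
LoD lod;
lod.push_back({0, 2, 3});     // kSourceLevel: candidates per source sentence
lod.push_back({0, 4, 7, 9});  // kSentenceLevel: word ids per candidate
// id_tensor then holds 9 word ids laid out candidate by candidate, which is
// the layout ConvertSentenceVectorToLodTensor builds above.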
lite/kernels/fpga/beam_search_decode_compute.h  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

class BeamSearchDecodeCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::BeamSearchDecodeParam;

  BeamSearchDecodeCompute() = default;

  void Run() override;

  virtual ~BeamSearchDecodeCompute() = default;
};

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
lite/kernels/fpga/box_coder_compute.cc  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/box_coder_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/float16.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

using float16 = zynqmp::float16;

void BoxCoderCompute::Run() {
  auto& param = Param<operators::ReshapeParam>();
  param.output->mutable_data<float16>();
}

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(box_coder,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::BoxCoderCompute,
                     def)
    .BindInput("PriorBox",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindInput("PriorBoxVar",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindInput("TargetBox",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindOutput("OutputBox",
                {LiteType::GetTensorTy(
                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .Finalize();
lite/kernels/fpga/box_coder_compute.h  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

class BoxCoderCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::BoxCoderParam;

  void Run() override;

  virtual ~BoxCoderCompute() = default;
};

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
lite/kernels/fpga/calib_compute.cc
@@ -33,13 +33,6 @@ void CalibComputeFp32ToFP16::Run() {
   const auto* din = param.input->data<float>();
   param.output->mutable_data<float16>();
   param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
-  // for (int i = 0; i < param.input->numel(); ++i) {
-  //   dout[i] = zynqmp::float_to_half(din[i]);
-  // }
-  param.input->ZynqTensor()->saveToFile("calib_input.txt");
-  param.output->ZynqTensor()->saveToFile("ouput_31.txt");
-  param.output->ZynqTensor()->printScale("calib");
   auto out_lod = param.output->mutable_lod();
   *out_lod = param.input->lod();
   return;
@@ -53,13 +46,7 @@ void CalibComputeFP16ToFp32::Run() {
   auto& param = this->Param<operators::CalibParam>();
   const auto* din = param.input->data<float16>();
   auto* dout = param.output->mutable_data<float>();
-  // for (int i = 0; i < param.input->numel(); ++i) {
-  //   dout[i] = zynqmp::half_to_float(din[i]);
-  // }
   param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
-  param.output->ZynqTensor()->saveToFile("ouput_13.txt");
   auto out_lod = param.output->mutable_lod();
   *out_lod = param.input->lod();
   return;
lite/kernels/fpga/concat_compute.cc
@@ -12,13 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/concat_compute.h"
 #include <string>
 #include <vector>
-#include "lite/kernels/fpga/concat_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -43,8 +45,10 @@ void ConcatCompute::PrepareForRun() {
 void ConcatCompute::Run() {
   pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::ConcatParam& concat_param = pe_.param();
-  concat_param.output->saveToFile("concat", true);
+  Debugger::get_instance().registerOutput("concat", concat_param.output);
 #endif
 }

 }  // namespace fpga
lite/kernels/fpga/conv_compute.cc  (file mode 100644 → 100755)
@@ -12,10 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/conv_compute.h"
 #include <vector>
-#include "lite/kernels/fpga/conv_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -60,14 +63,9 @@ void ConvCompute::PrepareForRun() {
     fill_scale_bias_const(&conv_param);
     if (param.bias != nullptr) {
       conv_param.bias()->copyFrom(param.bias->ZynqTensor());
-      std::cout << "copy bias\n";
     }
     conv_param.relu.enabled = param.fuse_relu;
-    // conv_param.filter->saveToFile("filter", true);
-    // conv_param.bias()->saveToFile("bias", true);
-    // conv_param.scale()->saveToFile("scale", true);
     conv_pe_.init();
     conv_pe_.apply();
   }
@@ -75,18 +73,15 @@ void ConvCompute::PrepareForRun() {
 void ConvCompute::Run() {
   auto& param = this->Param<param_t>();
-  // std::cout << "in:" << param.x->ZynqTensor()->data<void>() << std::endl;
   if (param.x->ZynqTensor()->shape().channel() != 1 &&
       param.groups == param.x->ZynqTensor()->shape().channel()) {
     dw_conv_pe_.dispatch();
     // param.output->ZynqTensor()->saveToFile("dw", true);
   } else {
-    zynqmp::ConvParam& conv_param = conv_pe_.param();
     conv_pe_.dispatch();
     // conv_param.input->saveToFile("_conv_in", true);
-    conv_param.output->printScale("conv");
-    param.output->ZynqTensor()->saveToFile("_conv", true);
     // conv_param.output->saveToFile("_conv_param", true);
+#ifdef FPGA_PRINT_TENSOR
+    zynqmp::ConvParam& conv_param = conv_pe_.param();
+    Debugger::get_instance().registerOutput("conv", conv_param.output);
+#endif
   }
 }
lite/kernels/fpga/dropout_compute.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/dropout_compute.h"
 #include <string>
-#include "lite/kernels/fpga/dropout_compute.h"
 #include "lite/backends/fpga/KD/float16.hpp"
-// #include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -54,12 +54,11 @@ void DropoutCompute::PrepareForRun() {
 }

 void DropoutCompute::Run() {
-  auto& param = Param<operators::DropoutParam>();
-  zynqmp::ScaleParam& scale_param = pe_.param();
-  // scale_param.input->saveToFile("drop_in.txt");
   pe_.dispatch();
-  // scale_param.output->saveToFile("drop_out.txt");
-  // std::cout << "prob:" << param.dropout_prob << std::endl;
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::ScaleParam& scale_param = pe_.param();
+  Debugger::get_instance().registerOutput("dropout", scale_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/elementwise_compute.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/elementwise_compute.h"
 #include <string>
-#include "lite/kernels/fpga/elementwise_compute.h"
 #include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -39,8 +40,10 @@ void ElementwiseAddCompute::PrepareForRun() {
 }

 void ElementwiseAddCompute::Run() {
   pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::ElementwiseAddParam& ew_param = pe_.param();
+  // ew_param.output->saveToFile("ew", true);
+  Debugger::get_instance().registerOutput("ew_add", ew_param.output);
+#endif
 }

 void ElementwiseAddActivationCompute::PrepareForRun() {
@@ -59,6 +62,10 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
 }

 void ElementwiseAddActivationCompute::Run() {
   pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::ElementwiseAddParam& ew_param = pe_.param();
+  Debugger::get_instance().registerOutput("ew_add", ew_param.output);
+#endif
 }

 void ElementwiseMulCompute::PrepareForRun() {
@@ -66,14 +73,8 @@ void ElementwiseMulCompute::PrepareForRun() {
   auto& param = Param<operators::ElementwiseParam>();
   param.Out->mutable_data<float16>();
   scale_param.input = param.X->ZynqTensor();
   scale_param.output = param.Out->ZynqTensor();
-  // param.Y->ZynqTensor()->saveToFile("scale_y", true);
-  std::cout << "y_production:" << param.Y->dims().production() << std::endl;
-  // exit(-1);
   scale_param.relu.enabled = false;
@@ -85,39 +86,26 @@ void ElementwiseMulCompute::PrepareForRun() {
   zynqmp::Shape shape(zynqmp::N, {channel});
   float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
   float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
   float scale_value = param.Y->data<float>()[0];
-  std::cout << "scale_value:" << scale_value << std::endl;
-  std::cout << "channel:" << channel << std::endl;
-  std::cout << "data_type:" << param.Y->ZynqTensor()->dataType() << std::endl;
-  // exit(-1);

   for (int i = 0; i < channel; ++i) {
     if (param.Y->dims().production() != 1) {
       scale_value = param.Y->ZynqTensor()->data<float>()[i];
     }
     scale_data[i] = scale_value;
     bias_data[i] = 0;
   }

   pe_.init();
   pe_.apply();
   // scale_param.input->saveToFile("scale_input", true);
   // scale_param.scale->saveToFile("scale_scale", true);
-  param.Y->ZynqTensor()->saveToFile("ew_y", true);
-  // exit(-1);
 }

 void ElementwiseMulCompute::Run() {
   pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::ScaleParam& scale_param = pe_.param();
+  // scale_param.output->saveToFile("ew_mul", true);
+  // exit(-1);
+  Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/fc_compute.cc
@@ -15,6 +15,7 @@
 #include "lite/kernels/fpga/fc_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -30,7 +31,6 @@ void FcCompute::PrepareForRun() {
   zynqmp::FullyConnectedParam& fc_param = pe_.param();

   param.output->mutable_data<float16>();
   fc_param.input = param.input->ZynqTensor();
   fc_param.output = param.output->ZynqTensor();
   fc_param.filter = param.w->ZynqTensor();
@@ -42,8 +42,10 @@ void FcCompute::PrepareForRun() {
 void FcCompute::Run() {
   pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::FullyConnectedParam& fc_param = pe_.param();
+  // fc_param.output->saveToFile("fc", true);
+  Debugger::get_instance().registerOutput("fc", fc_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/feed_compute.cc
@@ -15,6 +15,7 @@
 #include "lite/kernels/fpga/feed_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -37,24 +38,17 @@ void FeedCompute::PrepareForRun() {
 }

 void FeedCompute::Run() {
-  std::cout << "================= FeedCompute =================\n";
   auto& param = this->Param<param_t>();
   Tensor& x = param.feed_list->at(param.col);
-  zynqmp::InputParam& feed_param = pe_.param();
-  if (x.dims().production() == 7590) {
-    feed_param.input->readFromFile("position_encoding.data");
-    feed_param.input->saveToFile("read.txt");
-  }
   pe_.dispatch();
   auto out_lod = param.out->mutable_lod();
   *out_lod = x.lod();
-  feed_param.input->saveToFile("feed_in.txt");
-  feed_param.output->saveToFile("feed.txt");
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::InputParam& feed_param = pe_.param();
+  Debugger::get_instance().registerOutput("feed", feed_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/fetch_compute.cc
@@ -14,6 +14,7 @@
 #include "lite/kernels/fpga/fetch_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -25,7 +26,7 @@ using float16 = zynqmp::float16;
 void FetchCompute::PrepareForRun() {
   auto& param = this->Param<param_t>();
   // ====================================================
-  zynqmp::OutputParam& conv_param = pe_.param();
+  zynqmp::OutputParam& fetch_param = pe_.param();

   auto fetch_list = param.fetch_list;
   if (fetch_list->size() <= static_cast<size_t>(param.col)) {
     fetch_list->resize(param.col + 1);
@@ -34,8 +35,8 @@ void FetchCompute::PrepareForRun() {
   out.Resize(param.input->dims());
   out.mutable_data<float>();
-  conv_param.input = param.input->ZynqTensor();
-  conv_param.output = out.ZynqTensor();
+  fetch_param.input = param.input->ZynqTensor();
+  fetch_param.output = out.ZynqTensor();

   pe_.init();
   pe_.apply();
@@ -44,8 +45,11 @@ void FetchCompute::PrepareForRun() {
 void FetchCompute::Run() {
   pe_.dispatch();
   auto& param = this->Param<param_t>();
-  zynqmp::OutputParam& conv_param = pe_.param();
-  conv_param.output->saveToFile("fetch", true);
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::OutputParam& fetch_param = pe_.param();
+  Debugger::get_instance().registerOutput("fetch", fetch_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/mul_compute.cc
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/mul_compute.h"
 #include <vector>
-// #include "lite/backends/arm/math/funcs.h"
-#include "lite/kernels/fpga/mul_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -37,7 +38,6 @@ void MulCompute::PrepareForRun() {
   fc_param.output = param.output->ZynqTensor();
   fc_param.filter = param.y->ZynqTensor();
-  // fc_param.bias = param.bias->ZynqTensor();
   fc_param.bias = &bias_;
   int channel = fc_param.filter->shape().channel();
@@ -59,15 +59,7 @@ void mul(MulCompute* k) {
   int fn = param.y->dims()[1];
-  std::cout << "num: " << num << std::endl;
-  std::cout << "channel: " << channel << std::endl;
-  std::cout << "fn: " << fn << std::endl;
-  param.y->ZynqTensor()->saveToFile("filter.txt");
   float16* out_data = param.output->mutable_data<float16>();
   // int si = 0;
   int g_index = 0;
   for (int n = 0; n < 1; n++) {
@@ -77,12 +69,10 @@ void mul(MulCompute* k) {
     for (int c = 0; c < channel; c++) {
       float value = zynqmp::half_to_float(param.x->data<float16>()[si]);
       int index = c * fn + on;
       // std::cout << "index: " << index << std::endl;
       float weight = param.y->data<float>()[index];
       sum += value * weight;
       si++;
     }
-    std::cout << sum << "\n";
     out_data[g_index] = zynqmp::float_to_half(sum);
     g_index++;
   }
@@ -91,37 +81,12 @@ void mul(MulCompute* k) {
 void MulCompute::Run() {
   // auto& param = Param<param_t>();
-  zynqmp::FullyConnectedParam& fc_param = pe_.param();
-  std::cout << "1\n";
   // fc_param.input->readFromFile("arm_8_im_in.data");
   // fc_param.input->flush();
-  float16* data_in = fc_param.input->data<float16>();
   // float16 one = zynqmp::float_to_half(1.0f);
   // for (int i = 0; i < fc_param.input->shape().alignedElementCount(); i++) {
   //   data_in[i] = one;
   // }
   // fc_param.input->scale()[0] = 1.0 / 127;
   // fc_param.input->scale()[1] = 127;
   pe_.dispatch();
-  // std::cout << "2\n";
-  // fc_param.input->printScale("mul");
-  // std::cout << "3\n";
-  fc_param.input->saveToFile("mul_in.txt");
-  // std::cout << "4\n";
-  // mul(this);
-  // std::cout << "5\n";
-  fc_param.output->saveToFile("mul_out.txt");
-  // exit(-1);
   // exit(-1);
   // fc_param.output->saveToFile("mul.txt");
   // Tensor* output = const_cast<Tensor*>(param.output);
   // const auto* x_data = param.x->data<float>();
   // param.y->mutable_data<float16>();
   // param.output->mutable_data<float16>();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::FullyConnectedParam& fc_param = pe_.param();
+  Debugger::get_instance().registerOutput("mul", fc_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/multiclass_nms_compute.cc
@@ -195,17 +195,13 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
   T score_threshold = static_cast<T>(param.score_threshold);

   int num_det = 0;

   int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
-  // scores.ZynqTensor()->saveToFile("nms_scores", true);
   for (int64_t c = 0; c < class_num; ++c) {
     Tensor bbox_slice, score_slice;
     if (c == background_label) continue;
     if (scores_size == 3) {
       scores.Slice<T>(score_slice, c, c + 1);
-      // score_slice.ZynqTensor()->saveToFile("nms_slice", true);
       bbox_slice = bboxes;
     } else {
       score_slice.Resize({scores.dims()[0], 1});
@@ -387,27 +383,19 @@ void MulticlassNmsCompute::Run() {
       if (e > s) {
         Tensor out;
         outs->Slice<float>(out, s, e);
         // scores_slice.ZynqTensor()->saveToFile("scores_slice", true);
         MultiClassOutput<float>(
             scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
-        out.ZynqTensor()->saveToFile("out", true);
         outs->ZynqTensor()->copyFrom(out.ZynqTensor());
       }
     }
   }

-  // save_tensor(param.scores, "_scores.txt", false);
-  // save_tensor(param.bboxes, "_bboxes.txt", false);
-  boxes->ZynqTensor()->saveToFile("_boxes", true);
-  scores->ZynqTensor()->saveToFile("_scores", true);
-  outs->ZynqTensor()->saveToFile("_outs", true);
   LoD lod;
   lod.emplace_back(batch_starts);
   outs->set_lod(lod);
+#ifdef FPGA_PRINT_TENSOR
+  Debugger::get_instance().registerOutput("nms", outs->ZynqTensor());
+#endif
 }

 }  // namespace host
 }  // namespace kernels
lite/kernels/fpga/norm_compute.cc
@@ -13,8 +13,7 @@
 // limitations under the License.

 #include "lite/kernels/fpga/norm_compute.h"
-// #include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -27,7 +26,6 @@ void NormCompute::PrepareForRun() {
   auto& param = this->Param<operators::NormParam>();
   param.Out->mutable_data<float16>();

   zynqmp::NormParam& norm_param = pe_.param();
   norm_param.input = param.X->ZynqTensor();
   norm_param.output = param.Out->ZynqTensor();
@@ -39,20 +37,10 @@ void NormCompute::PrepareForRun() {
 void NormCompute::Run() {
   pe_.dispatch();
-  pe_.param().output->saveToFile("norm.txt", true);
-  // auto& ctx = this->ctx_->template As<ARMContext>();
-  // auto& param = this->Param<operators::NormParam>();
-  // auto input_dims = param.X->dims();
-  // int dim_size = param.X->dims().size();
-  // auto axis = (param.axis < 0) ? param.axis + dim_size : param.axis;
-  // const auto* x_data = param.X->data<float>();
-  // auto* o_data = param.Out->mutable_data<float>();
-  // int pre_n = input_dims.count(0, axis);
-  // int post_n = input_dims.count(axis + 1, dim_size);
-  // int n = input_dims[axis];
-  // lite::arm::math::norm(x_data, pre_n, n, post_n, param.epsilon, o_data, &ctx);
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::NormParam& norm_param = pe_.param();
+  Debugger::get_instance().registerOutput("norm", norm_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/pooling_compute.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/pooling_compute.h"
 #include <string>
 #include <vector>
-#include "lite/kernels/fpga/pooling_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -47,9 +49,10 @@ void PoolCompute::PrepareForRun() {
 void PoolCompute::Run() {
   pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::PoolingParam& pool_param = pe_.param();
-  pool_param.output->printScale("pooling");
-  pool_param.output->saveToFile("pool", true);
+  Debugger::get_instance().registerOutput("pooling", pool_param.output);
 #endif
 }

 }  // namespace fpga
lite/kernels/fpga/while_compute.cc  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/while_compute.h"
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

void WhileCompute::PrepareForRun() {
  auto &param = Param<operators::WhileParam>();
  auto cur_scope = param.scope;
  executor_ =
      std::make_shared<StepExecutor>(param.sub_block, cur_scope, place());
}

void WhileCompute::Run() {
  auto &param = Param<operators::WhileParam>();
  while (param.cond->data<bool>()[0]) {
    executor_->Run();
  }
}

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(while,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::WhileCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindInput("Condition",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(
                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindOutput("StepScopes", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
lite/kernels/fpga/while_compute.h  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/operators/while_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

class StepExecutor {
  typedef std::shared_ptr<OpLite> OpPtr;

 public:
  StepExecutor(cpp::BlockDesc *block, Scope *scope, Place place)
      : scope_(scope), place_(place) {
    int32_t op_size = block->OpsSize();
    for (int32_t i = 0; i < op_size; ++i) {
      auto &op_desc = *block->template GetOp<cpp::OpDesc>(i);
      auto op_type = op_desc.Type();
      auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type());
      VLOG(LOG_INFO) << "while: creating Op [" << op_type << "]";
      op_handler->Attach(op_desc, scope);

      auto hostplace = place_;
      hostplace.target = TARGET(kHost);
      auto kernels = op_handler->CreateKernels({place_, hostplace});
      CHECK_GT(kernels.size(), 0) << "cannot create kernel";
      op_handler->AttachKernel(kernels[0].get());
      op_handler->SetKernel(kernels);
      ops_of_block_.push_back(op_handler);
    }
  }

  void Run() {
    for (auto &op_handler : ops_of_block_) {
      // VLOG(4) << op_handler->op_info()->Repr();
      op_handler->InferShape();
      // VLOG(4) << "while: infered shape";
      op_handler->Run();
    }
  }

 private:
  Scope *scope_;
  Place place_;
  std::vector<OpPtr> ops_of_block_;
};

class WhileCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::WhileParam;

  void Run() override;
  void PrepareForRun() override;

  virtual ~WhileCompute() = default;

 private:
  std::shared_ptr<StepExecutor> executor_;
};

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
lite/kernels/fpga/write_to_array_compute.cc  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/write_to_array_compute.h"
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

void WriteToArrayCompute::PrepareForRun() {}

void WriteToArrayCompute::Run() {
  auto &ctx = this->ctx_->template As<ARMContext>();
  auto &param = this->Param<operators::WriteToArrayParam>();
  CHECK_EQ(param.I->numel(), 1) << "input2 should have only one element";

  const auto *x_data = param.X->data<float>();
  int id = param.I->data<int>()[0];
  int id_test = param.I->data<int64_t>()[0];
  if (id >= param.Out->size()) {
    for (int i = param.Out->size(); i < id + 1; i++) {
      lite::Tensor tmp;
      param.Out->push_back(tmp);
    }
  }

  (*param.Out)[id].Resize(param.X->dims());
  auto out_lod = (*param.Out)[id].mutable_lod();
  *out_lod = param.X->lod();
  auto *o_data = (*param.Out)[id].mutable_data<float>(TARGET(kHost));
  int input_size = param.X->numel();
  memcpy(o_data, x_data, sizeof(float) * input_size);
}

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(write_to_array,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::WriteToArrayCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindInput("I", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(
                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .Finalize();
lite/kernels/fpga/write_to_array_compute.h  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

class WriteToArrayCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::WriteToArrayParam;

  void PrepareForRun() override;

  void Run() override;

  ~WriteToArrayCompute() {}

 private:
};

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle