Commit 51d144f3 authored by C chonwhite

format code

Parent 4dddc907
......@@ -205,8 +205,8 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc
DEPS ${lite_model_test_DEPS})
# lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
# DEPS ${lite_model_test_DEPS}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
// float* temp_data = new float(33 * 10 * 23);
// std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
// std::vector<std::string> files;
// std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()),
// [](DIR* dir) { dir&& closedir(dir); });
// struct dirent* dirent_ptr;
// if (!directory_ptr) {
// std::cout << "Error opening : " << std::strerror(errno) << dir <<
// std::endl;
// return files;
// }
// while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
// files.push_back(std::string(dirent_ptr->d_name));
// }
// return files;
// }
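// Reads `num` whitespace-separated float values from the text file at `path` into `data`.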
void read_from_file(const std::string& path, float* data, int num) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
std::cout << "failed to open file: " << path << std::endl;
exit(-1);
}
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
}
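// Converts a CHW-layout buffer into HWC layout (channel becomes the fastest-varying dimension).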
void chw_to_hwc(float* src, float* dst, int channel, int height, int width) {
int amount_per_row = width * channel;
int index = 0;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
int dst_index = offset_height + w * channel + c;
dst[dst_index] = src[index];
index = index + 1;
}
}
}
}
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
// predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
predictor.Build("", "attention/model", "attention/params", valid_places);
auto* input_tensor = predictor.GetInput(0);
// input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 1, 48, 512})));
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 1, 100, 200})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
// std::ifstream file_stream(FLAGS_input_file);
// // file_stream.open(path);
// if (!file_stream.good()) {
// std::cout << "file: " << FLAGS_input_file << " dones not exist!\n";
// exit(-1);
// return;
// }
// read_from_file("n7cu17.data", data, 100 * 200);
read_from_file(FLAGS_input_file, data, 100 * 200);
// read_from_file("t.data", data, 48 * 512);
// for (int i = 0;i < 48 * 512;i++ ) {
// std::cout << ":" << data[i] << std::endl;
// }
//=============================================
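// Input 1: initial beam-search ids (a single id 0) with a two-level LoD of {{0, 1}, {0, 1}}.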
auto* init_ids = predictor.GetInput(1);
init_ids->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_ids = init_ids->mutable_data<float>();
auto ids_size = init_ids->dims().production();
for (int i = 0; i < ids_size; i++) {
data_ids[i] = 0;
}
auto lod_ids = init_ids->mutable_lod();
std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
*lod_ids = lod_i;
//=============================================
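// Input 2: initial beam-search scores, zero-filled, with the same LoD structure as the ids.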
auto* init_scores = predictor.GetInput(2);
init_scores->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_scores = init_scores->mutable_data<float>();
auto scores_size = init_scores->dims().production();
for (int i = 0; i < scores_size; i++) {
data_scores[i] = 0;
}
auto lod_scores = init_scores->mutable_lod();
std::vector<std::vector<uint64_t>> lod_s{{0, 1}, {0, 1}};
*lod_scores = lod_s;
//=============================================
auto* position_encoding = predictor.GetInput(3);
position_encoding->Resize(
DDim(std::vector<DDim::value_type>({1, 33, 10, 23})));
auto* position_encoding_data = position_encoding->mutable_data<float>();
float* temp_data = position_encoding_data;
std::cout << "====================== 1\n";
for (int i = 0; i < position_encoding->dims().production(); ++i) {
temp_data[i] = 0;
}
std::cout << "====================== 2\n";
int index = 0;
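// Fill the 1 x 33 x 10 x 23 position encoding: the first 10 channels one-hot encode the row index,
// the remaining 23 channels one-hot encode the column index.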
for (int i = 0; i < 10; i++) {
for (int row = 0; row < 10; row++) {
for (int col = 0; col < 23; col++) {
if (i == row) {
temp_data[index] = 1.0f;
} else {
temp_data[index] = 0.0f;
}
index++;
}
}
}
std::cout << "====================== 3\n";
for (int i = 0; i < 23; i++) {
for (int row = 0; row < 10; row++) {
for (int col = 0; col < 23; col++) {
if (i == col) {
temp_data[index] = 1.0f;
} else {
temp_data[index] = 0.0f;
}
index++;
}
}
}
std::cout << "====================== 4\n";
// chw_to_hwc(temp_data, position_encoding_data, 33, 10, 23);
// delete[] temp_data;
// read_from_file("position_encoding.data", position_encoding_data, 33 * 10 *
// 23);
// position_encoding->ZynqTensor()->readFromFile("position_encoding.data");
// exit(-1);
// for (int i = 0; i < FLAGS_warmup; ++i) {
// predictor.Run();
// }
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
std::cout << "================== Speed Report ===================\n";
std::cout << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average." << std::endl;
// std::vector<std::vector<float>> results;
// // i = 1
// results.emplace_back(std::vector<float>(
// {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665,
// 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625,
// 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529,
// 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986,
// 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722}));
auto* out = predictor.GetOutput(0);
// ASSERT_EQ(out->dims().size(), 2);
// ASSERT_EQ(out->dims()[0], 1);
// ASSERT_EQ(out->dims()[1], 1000);
//
// int step = 50;
for (int i = 0; i < 10; i++) {
// std::cout << ":" << out->data<float>()[i] << std::endl;
}
// for (int i = 0; i < results.size(); ++i) {
// for (int j = 0; j < results[i].size(); ++j) {
// EXPECT_NEAR(out->data<float>()[j * step + (out->dims()[1] * i)],
// results[i][j],
// 1e-6);
// }
// }
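// Dump the raw output values to plate_data/<file name> (the first nine characters of
// FLAGS_input_file are assumed to be a directory prefix and are stripped).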
std::string file = "plate_data/" + FLAGS_input_file.substr(9);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
}
TEST(OcrAttention, test_arm) {
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
TestModel(valid_places, Place{TARGET(kARM), PRECISION(kFloat)});
}
} // namespace lite
} // namespace paddle
......@@ -20,14 +20,8 @@ namespace zynqmp {
DLEngine::DLEngine() {
open_device();
int ret = get_device_info(info_);
// filter::set_filter_capacity(2048);
filter::set_filter_capacity(info_.filter_cap);
filter::set_colunm(info_.colunm);
std::cout << " version:" << info_.version;
std::cout << " device_type:" << info_.device_type;
std::cout << " filter_cap:" << info_.filter_cap;
std::cout << " colunm:" << info_.colunm << std::endl;
}
} // namespace zynqmp
......
......@@ -30,8 +30,7 @@ class DLEngine {
DeviceInfo& deviceInfo();
// bool isZU3() { return info_.device_type / 100 == 3; }
bool isZU3() { return true; }
bool isZU3() { return info_.device_type / 100 == 3; }
float* out_data = nullptr;
......
......@@ -61,8 +61,6 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
// std::cout << "fpga malloc: 0x" << std::hex << size << std::dec << " (" <<
// size << ") - ";
#ifdef ENABLE_DEBUG
// std::cout << "fpga_malloc:" << size << std::endl;
#endif
......@@ -73,7 +71,6 @@ void *fpga_malloc(size_t size) {
std::cout << "not enough memory !";
exit(-1);
}
// std::cout << std::hex << ptr << std::dec << std::endl;
memory_map.insert(std::make_pair(ptr, size));
memory_size += size;
if (memory_size > memory_size_max) {
......@@ -91,8 +88,6 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
// size_t size = 0;
// int i = 0;
auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
while (iter != memory_map.end()) {
total += iter->second;
......@@ -108,11 +103,8 @@ void fpga_free(void *ptr) {
size = iter->second;
memory_map.erase(iter);
}
memory_size -= size;
#ifdef PADDLE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
......@@ -129,9 +121,6 @@ int fpga_flush(void *address, size_t size) {
}
int fpga_invalidate(void *address, size_t size) {
// std::cout <<
// "=================================================================================="
// << std::endl;
struct MemoryCacheArgs args;
args.address = address;
args.size = size;
......@@ -162,84 +151,21 @@ int fpga_reset() {
}
int ioctl_conv(const struct ConvArgs &args) {
#ifdef ENABLE_DEBUG
// std::cout << "======Compute Basic Conv======";
// std::cout << " relu_enabled:" << args.relu_enabled
// << " sb_address:" << args.sb_address
// << " filter_address:" << args.filter_address
// << " filter_num:" << args.filter_num
// << " group_num:" << args.group_num;
// std::cout << " image_address:" << args.image.address
// << " image_scale_address:" << args.image.scale_address
// << " image_channels:" << args.image.channels
// << " image_height:" << args.image.height
// << " image_width:" << args.image.width
// << " pad_height:" << args.image.pad_height
// << " pad_width:" << args.image.pad_width;
// std::cout << " kernel_height:" << args.kernel.height
// << " kernel_width:" << args.kernel.width
// << " stride_h:" << args.kernel.stride_h
// << " stride_w:" << args.kernel.stride_w;
// std::cout << " out_address:" << args.output.address
// << " out_scale_address:" << args.output.scale_address;
//
// float* in_scale = (float*)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
// return 0;
}
int compute_fpga_conv_basic(const struct ConvArgs &args) {
#ifdef ENABLE_DEBUG
// std::cout << "======Compute Basic Conv======";
// std::cout << " relu_enabled:" << args.relu_enabled
// << " sb_address:" << args.sb_address
// << " filter_address:" << args.filter_address
// << " filter_num:" << args.filter_num
// << " group_num:" << args.group_num;
// std::cout << " image_address:" << args.image.address
// << " image_scale_address:" << args.image.scale_address
// << " image_channels:" << args.image.channels
// << " image_height:" << args.image.height
// << " image_width:" << args.image.width
// << " pad_height:" << args.image.pad_height
// << " pad_width:" << args.image.pad_width;
// std::cout << " kernel_height:" << args.kernel.height
// << " kernel_width:" << args.kernel.width
// << " stride_h:" << args.kernel.stride_h
// << " stride_w:" << args.kernel.stride_w;
// std::cout << " out_address:" << args.output.address
// << " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << " scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
// float *filter_scale = (float *)args.filter_scale_address;
// std::cout << " filter scale:" << filter_scale[0] << "," <<
// filter_scale[1] << std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv(const struct SplitConvArgs &args) {
// return do_ioctl(IOCTL_CONFIG_CONV, &args);
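// Issue one basic-convolution ioctl per split; more than one split is not expected here
// and aborts below.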
int split_num = args.split_num;
int ret = -1;
for (int i = 0; i < split_num; i++) {
// ComputeBasicConv(args.conv_args[i]);
ret = compute_fpga_conv_basic(args.conv_arg[i]);
}
if (split_num > 1) {
std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" << std::endl;
exit(-1);
}
return ret;
......@@ -254,10 +180,7 @@ int compute_fpga_ewadd(const struct EWAddArgs &args) {
}
int get_device_info(const struct DeviceInfo &args) {
// DeviceInfo info;
// struct DeviceInfo* a = &info;
int ret = do_ioctl(IOCTL_DEVICE_INFO, &args);
// std::cout << "a." << a->filter_cap << std::endl;
return ret;
}
......@@ -299,7 +222,6 @@ int perform_bypass(const struct BypassArgs &args) {
}
int remainder = size - max_size * count;
// std::cout << "remainder:" << remainder << std::endl;
if (remainder > 0) {
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
......@@ -309,7 +231,6 @@ int perform_bypass(const struct BypassArgs &args) {
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
}
args.output.scale_address[0] = scale;
args.output.scale_address[1] = 1.0f / scale;
return ret;
......@@ -318,52 +239,10 @@ int perform_bypass(const struct BypassArgs &args) {
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
int compute_fpga_dwconv(const struct DWconvArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Basic Conv======";
std::cout << " relu_enabled:" << args.relu_enabled
<< " filter_address:" << args.filter_address;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}
......
......@@ -130,9 +130,6 @@ class ConvPE : public PE {
wi = w - wstart;
}
const int index = (h * image_width + w) * image_channels + c;
// int weight_index = (hi *
// kernel_width + wi) * image_channels
// + c;//TODO
int weight_index = oc * filter_chw +
kernel_width * kernel_height * c +
kernel_width * hi + wi;
......@@ -141,8 +138,6 @@ class ConvPE : public PE {
}
}
}
// std::cout << " ============================= pool_index:" <<
// pool_index << " sum:" << sum << std::endl;
if (param_.relu.enabled && sum < 0) {
sum = -sum;
......@@ -171,13 +166,6 @@ class ConvPE : public PE {
float_input.copyFrom(input);
float_input.syncToCPU();
// float_input.saveToFile("input", true);
// param_.filter->saveToFile("filter", true);
// param_.bias()->saveToFile("bias", true);
// exit(-1);
// float16* data_out = output->data<float16>();
float* out = float_output.mutableData<float>(FP32, output->shape());
float* bias_data = param_.bias()->data<float>();
......@@ -205,14 +193,8 @@ class ConvPE : public PE {
int image_index = h * out_width * in_channel + w * in_channel + j;
float value = image_addr[image_index] * filter_ptr[j];
sum += value;
// mi[j] = value;
}
// for (int j = 0; j < in_channel; j++) {
// sum += mi[j];
// }
sum += bias_data[i];
if (param_.relu.enabled && sum < 0) {
......@@ -232,10 +214,6 @@ class ConvPE : public PE {
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
// float_output.saveToFile("out", true);
// exit(-1);
}
bool dispatch() {
......@@ -264,7 +242,6 @@ class ConvPE : public PE {
std::vector<BasicConvParam*>& params = param_.splitParams();
int ret = 0;
for (auto conv_param : params) {
// conv_param->input.printScale();
ret |= compute_fpga_conv_basic(conv_param->args);
}
......@@ -282,34 +259,16 @@ class ConvPE : public PE {
size_t size = params.size();
if (split_axis == 0 && ret == 0 && size > 1) {
// std::cout << "concat size:" << size << std::endl;
concatPE_.dispatch();
}
if (split_axis == 1 && ret == 0 && size > 1) {
// for (int n = 0; n < size - 1; n++) {
ElementwiseAddParam& add_param = addPE_.param();
add_param.inputs = {&params[0]->output, &params[1]->output};
add_param.output = param_.output;
addPE_.init();
addPE_.apply();
addPE_.dispatch();
// param_.output->printScale();
// params[0]->input.saveToFile("conv_1.txt");
// params[1]->input.saveToFile("conv_2.txt");
// params[0]->output.saveToFile("ew_o1.txt");
// params[1]->output.saveToFile("ew_o2.txt");
// std::cout << "\n ================== EW ================== \n";
// }
}
if (param_.input->shape().channel() == 64 &&
param_.output->shape().channel() == 128) {
// exit(-1);
}
return ret == 0;
}
......
......@@ -212,7 +212,6 @@ inline void format_filter(Tensor* filter,
for (size_t i = 0; i < max_values.size(); i++) {
scales.push_back(max_values[i] / max_value);
// scales.push_back(1.0f);
}
// filter->saveToFile("filter.txt");
......@@ -345,10 +344,8 @@ inline void split_filter_num(const ConvParam& c_param) {
Shape s_shape(N, {filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
// std::cout << "v size: " << v.size() << std::endl;
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
// scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
......@@ -366,8 +363,6 @@ inline void split_filter_num(const ConvParam& c_param) {
// param.scale()->saveToFile("scale.txt");
// param.bias()->saveToFile("bias.txt");
// exit(-1);
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>();
......@@ -492,7 +487,6 @@ inline int fill_split_arg(const ConvParam& c_param) {
split_filter_num(c_param);
return 0;
}
// split_filter_num(c_param);
}
inline bool compute_conv(const ConvParam& c_conv_params) {
......
......@@ -114,8 +114,6 @@ class PoolingPE : public PE {
for (int c = 0; c < image_channels; ++c) {
const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
float sum = 0;
// const int index =
// (hstart * image_width + wstart) * image_channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = (h * image_width + w) * image_channels + c;
......@@ -144,9 +142,7 @@ class PoolingPE : public PE {
Tensor float_input;
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
......@@ -163,7 +159,6 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
void cpu_compute() {
......@@ -193,7 +188,6 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
bool dispatch() {
......
......@@ -43,81 +43,6 @@ class ScalePE : public PE {
return true;
}
// void apply() {
// Tensor* input = param_.input;
// Tensor* output = param_.output;
// Shape& input_shape = input->shape();
// int channel = input_shape.channel();
// int repeat = 1;
// int alignment = 16;
// int length = channel;
// if (channel % alignment != 0 || channel < alignment) {
// int c_lcm = lcm(channel, alignment);
// repeat = c_lcm / (channel);
// }
// Shape shape(N, {channel * repeat});
// param_.alignedBias()->mutableData<float16>(FP16, shape);
// param_.alignedScale()->mutableData<float16>(FP16, shape);
// float16* bias_data = param_.alignedBias()->data<float16>();
// float16* scale_data = param_.alignedScale()->data<float16>();
// if (param_.bias != nullptr) {
// float* bias_data_float = param_.bias->data<float>();
// for (int i = 0; i < repeat; i++) {
// for (int j = 0; j < length; j++) {
// float16 value = float_to_half(bias_data_float[j]);
// bias_data[i * length + j] = value;
// // bias_data[i * length + j] = float_to_half(1.0f);
// }
// }
// } else {
// float16 zero = float_to_half(0.0f);
// for (int i = 0; i < repeat; i++) {
// for (int j = 0; j < length; j++) {
// bias_data[i * length + j] = zero;
// }
// }
// }
// float* scale_data_float = param_.scale->data<float>();
// for (int i = 0; i < repeat; i++) {
// for (int j = 0; j < length; j++) {
// float16 value = float_to_half(scale_data_float[j]);
// scale_data[i * length + j] = value;
// }
// }
// param_.alignedScale()->flush();
// param_.alignedBias()->flush();
// int wc = input_shape.width() * input_shape.channel();
// int wc_aligned = align_image(wc);
// ScaleArgs& args = param_.args;
// args.scale_address = param_.alignedScale()->data<void>();
// args.bias_address = param_.alignedBias()->data<void>();
// args.wc_alignment = wc_aligned;
// args.channel_alignment = channel * repeat;
// args.image.address = input->data<void>();
// args.image.scale_address = input->scale();
// args.image.channels = channel;
// args.image.height = input_shape.height();
// args.image.width = input_shape.width();
// args.image.pad_width = 0;
// args.image.pad_height = 0;
// args.output.address = output->data<void>();
// args.output.scale_address = output->scale();
// }
// bool dispatch() {
// param_.input->syncToDevice();
// std::cout << "scale dispatch" << std::endl;
// return compute_fpga_scale(param_.args) == 0;
// }
void apply() {
Tensor* input = param_.input;
Tensor* output = param_.output;
......@@ -241,8 +166,6 @@ class ScalePE : public PE {
for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c];
std::cout << "value:" << value << " = " << half_to_float(in_data[index])
<< " x " << scale_data[c] << std::endl;
data_out[index] = float_to_half(value);
if (value < 0) {
......@@ -273,12 +196,6 @@ class ScalePE : public PE {
dw_param.quantizedFilter()->flush();
// apply();
}
// param_.scale->saveToFile("scale.txt");
// cpu_compute();
// return true;
// param_.input->syncToDevice();
// return compute_fpga_scale(param_.args) == 0;
param_.input->syncToDevice();
return dw_pe_.dispatch();
}
......
......@@ -221,10 +221,6 @@ void BoxCoderCompute::Run() {
}
}
}
// prior_box->ZynqTensor()->saveToFile("prior_box", true);
// prior_box_var->ZynqTensor()->saveToFile("prior_box_var", true);
// output_box->ZynqTensor()->saveToFile("box_coder", true);
}
} // namespace arm
......
......@@ -61,25 +61,9 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.Out->template mutable_data<int32_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.Out->template mutable_data<int8_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
auto data = param.Out->template mutable_data<T>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
}
......
......@@ -85,9 +85,6 @@ void PriorBoxCompute::Run() {
is_clip,
order,
min_max_aspect_ratios_order);
param.boxes->ZynqTensor()->saveToFile("pb_boxes", true);
param.variances->ZynqTensor()->saveToFile("pb_variance", true);
}
} // namespace arm
......@@ -106,17 +103,3 @@ REGISTER_LITE_KERNEL(prior_box,
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// REGISTER_LITE_KERNEL(prior_box,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::arm::PriorBoxCompute,
// def)
// .BindInput("Input",{LiteType::GetTensorTy(
// TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
// .BindInput("Image", {LiteType::GetTensorTy(
// TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
// .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
// .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
// .Finalize();
......@@ -9,14 +9,14 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
# add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
......
......@@ -47,7 +47,7 @@ void ConcatCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConcatParam& concat_param = pe_.param();
Debugger.get_instance()::registerOutput("concat", concat_param.output);
Debugger::get_instance().registerOutput("concat", concat_param.output);
#endif
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/density_prior_box_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
// inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
// bool flip,
// std::vector<float>* output_aspect_ratior) {
// constexpr float epsilon = 1e-6;
// output_aspect_ratior->clear();
// output_aspect_ratior->push_back(1.0f);
// for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
// float ar = input_aspect_ratior[i];
// bool already_exist = false;
// for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
// if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
// already_exist = true;
// break;
// }
// }
// if (!already_exist) {
// output_aspect_ratior->push_back(ar);
// if (flip) {
// output_aspect_ratior->push_back(1.0f / ar);
// }
// }
// }
// }
void DensityPriorBoxCompute::Run() {
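// NOTE: the FPGA density_prior_box kernel currently performs no computation; the ARM
// reference implementation is kept below, commented out, for reference.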
// auto& param = Param<operators::DensityPriorBoxParam>();
// bool is_flip = param.flip;
// bool is_clip = param.clip;
// std::vector<float> min_size = param.min_sizes;
// std::vector<float> fixed_size = param.fixed_sizes;
// std::vector<float> fixed_ratio = param.fixed_ratios;
// auto density_size = param.density_sizes;
// std::vector<float> max_size = param.max_sizes;
// std::vector<float> aspect_ratio = param.aspect_ratios;
// std::vector<float> variance = param.variances_;
// int img_w = param.img_w;
// int img_h = param.img_h;
// float step_w = param.step_w;
// float step_h = param.step_h;
// float offset = param.offset;
// std::vector<float> aspect_ratios_vec;
// ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec);
// size_t prior_num = aspect_ratios_vec.size() * min_size.size();
// prior_num += max_size.size();
// if (fixed_size.size() > 0) {
// prior_num = fixed_size.size() * fixed_ratio.size();
// }
// if (density_size.size() > 0) {
// for (int i = 0; i < density_size.size(); ++i) {
// if (fixed_ratio.size() > 0) {
// prior_num += (fixed_ratio.size() * ((pow(density_size[i], 2)) - 1));
// } else {
// prior_num +=
// ((fixed_ratio.size() + 1) * ((pow(density_size[i], 2)) - 1));
// }
// }
// }
// std::vector<std::string> order = param.order;
// lite::arm::math::density_prior_box(param.input,
// param.image,
// &param.boxes,
// &param.variances,
// min_size,
// fixed_size,
// fixed_ratio,
// density_size,
// max_size,
// aspect_ratio,
// variance,
// img_w,
// img_h,
// step_w,
// step_h,
// offset,
// prior_num,
// is_flip,
// is_clip,
// order);
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(density_prior_box,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::DensityPriorBoxCompute,
def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Image",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class DensityPriorBoxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::DensityPriorBoxParam;
void Run() override;
virtual ~DensityPriorBoxCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -45,7 +45,6 @@ class IoCopyHostToFpgaCompute
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
// param.y->CopyDataFrom(*param.x);
param.y->mutable_data<float16>();
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
......@@ -53,10 +52,8 @@ class IoCopyHostToFpgaCompute
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
// tempTensor.saveToFile("tempTensor", true);
tempTensor.setAligned(true);
tempTensor.unalignImage();
// tempTensor.saveToFile("unaligned", true);
param.y->ZynqTensor()->copyFrom(&tempTensor);
} else {
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
......@@ -97,11 +94,9 @@ class IoCopyFpgaToHostCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
// std::cout << "IoCopyFpgaToHostCompute \n";
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
// std::cout << "before CopyDataFrom \n";
param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
......@@ -113,10 +108,8 @@ class IoCopyFpgaToHostCompute
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
// tempTensor.saveToFile("tempTensor", true);
tempTensor.setAligned(true);
tempTensor.unalignImage();
// tempTensor.saveToFile("unaligned", true);
param.y->ZynqTensor()->copyFrom(&tempTensor);
} else {
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
......
......@@ -29,11 +29,6 @@ using float16 = zynqmp::float16;
template <typename T>
void convert_to_hwc(
T* chw_data, T* hwc_data, int num, int channel, int height, int width) {
std::cout << " -------------- chw -> HWC ---------------\n";
std::cout << "channel: " << channel << std::endl;
std::cout << "height: " << height << std::endl;
std::cout << "width: " << width << std::endl;
int chw = channel * height * width;
int wc = width * channel;
int index = 0;
......@@ -52,10 +47,6 @@ void convert_to_hwc(
template <typename T>
void hwc_to_chw(
T* chw_data, T* hwc_data, int num, int channel, int height, int width) {
std::cout << " ============= HWC -> CHW =============\n";
std::cout << "channel: " << channel << std::endl;
std::cout << "height: " << height << std::endl;
std::cout << "width: " << width << std::endl;
int chw = channel * height * width;
int wc = width * channel;
int wh = width * height;
......@@ -73,10 +64,7 @@ void hwc_to_chw(
}
void TransHwcToChw(Tensor* dest, const Tensor* src) {
std::cout << "precision:" << static_cast<int>(src->precision()) << std::endl;
std::cout << "dataType:" << src->ZynqTensor()->dataType() << std::endl;
if (src->ZynqTensor()->dataType() == zynqmp::FP32) {
std::cout << "float\n";
float* chw = dest->mutable_data<float>();
float* hwc = const_cast<float*>(src->data<float>());
int num = dest->dims()[0];
......@@ -94,7 +82,6 @@ void TransHwcToChw(Tensor* dest, const Tensor* src) {
}
if (src->ZynqTensor()->dataType() == zynqmp::FP16) {
std::cout << "float16\n";
float16* chw = dest->mutable_data<float16>();
float16* hwc = const_cast<float16*>(src->data<float16>());
int num = dest->dims()[0];
......@@ -126,9 +113,6 @@ class TransHwcToChwCompute
param.y->ZynqTensor()->flush();
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
// param.x->ZynqTensor()->saveToFile("src_hwc", true);
// param.y->ZynqTensor()->saveToFile("src_dst", true);
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
......
......@@ -84,7 +84,7 @@ void MulCompute::Run() {
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
Debugger.get_instance().registerOutput("mul", fc_param.output);
Debugger::get_instance().registerOutput("mul", fc_param.output);
#endif
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/kernels/fpga/sequence_pool_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
void SequencePoolCompute::PrepareForRun() {}
void SequencePoolCompute::Run() {
auto& param = Param<operators::SequencePoolParam>();
auto& output = param.Out;
const auto* din = param.X->data<float>();
float* dout = output->mutable_data<float>();
const auto pool_type = param.pool_type;
const auto lod = param.X->lod()[0];
int64_t width = param.X->numel() / param.X->dims()[0];
// if (pool_type == "SUM") {
// lite::arm::math::seq_pool_sum(din, dout, lod, width);
// } else if (pool_type == "AVERAGE") {
// lite::arm::math::seq_pool_average(din, dout, lod, width);
// } else if (pool_type == "SQRT") {
// lite::arm::math::seq_pool_sqrt(din, dout, lod, width);
// } else if (pool_type == "MAX") {
// lite::arm::math::seq_pool_max(din, dout, lod, width);
// } else if (pool_type == "MIN") {
// lite::arm::math::seq_pool_min(din, dout, lod, width);
// } else if (pool_type == "FIRST") {
// lite::arm::math::seq_pool_first(din, dout, lod, width);
// } else if (pool_type == "LAST") {
// lite::arm::math::seq_pool_last(din, dout, lod, width);
// } else {
// LOG(ERROR) << " UNKNOWN sequence pool type";
// }
int batch_size = lod.size() - 1;
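// Each input sequence is pooled to a single step, so the output LoD is simply {0, 1, ..., batch_size}.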
std::vector<uint64_t> offset_new(static_cast<uint64_t>(batch_size + 1));
for (int i = 0; i <= batch_size; i++) {
offset_new[i] = i;
}
(output->mutable_lod())->push_back(offset_new);
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(sequence_pool,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::SequencePoolCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class SequencePoolCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~SequencePoolCompute() = default;
private:
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle