Commit 2bc665fe authored by MyPandaShaoxiang

fix: delete useless code

test=develop
Parent 2e2c9d4b
@@ -17,7 +17,6 @@
#include <string>
#include <unordered_map>
// #include "lite/backends/fpga/lite_tensor.h"
#include "lite/core/tensor.h"
namespace paddle {
@@ -33,9 +32,7 @@ class Debugger {
}
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
// tensor->printScale();
if (op_type != "conv") {
// tensor->saveToFile(op_type, true);
if (op_type != "conv") { // NOLINT
}
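// The debug dump in this branch was removed; the empty if is kept and
// marked NOLINT to satisfy the linter.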
}
@@ -60,7 +57,6 @@ inline void chw_to_hwc(Tensor* t, float* dst) {
if (t->dims().size() > 3) {
width = t->dims()[3];
}
// int width = t->dims()[3];
const float* chw_data = t->data<float>();
float* hwc_data = dst;
@@ -92,11 +88,9 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) {
file_stream >> value;
data[i] = value;
}
// flush();
}
inline void save_float(float* data, const std::string& name, int len) {
// return;
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string new_string =
@@ -105,12 +99,8 @@ inline void save_float(float* data, const std::string& name, int len) {
std::string file = "arm_" + new_string + name;
counter++;
std::cout
<< "-------------------------- saving file: --------------------------"
<< file << std::endl;
std::ofstream ofs;
ofs.open(file);
// float* data = dst;
for (int i = 0; i < len; i++) {
float value = data[i];
ofs << value << std::endl;
@@ -135,7 +125,6 @@ inline void save_tensor(lite::Tensor* t,
inline void save_tensor(const lite::Tensor* t,
const std::string& name,
bool convert = true) {
// return;
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
......
@@ -28,26 +28,26 @@ static int FILTER_SIZE = 2048;
static int COLUMN = 4;
void saveToFile(std::string name, void* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// int8_t* data = (int8_t*)data_in;
// for (int i = 0; i < size; i++) {
// float value = data[i];
// ofs << value << std::endl;
// }
// ofs.close();
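// The live version below replaces the commented-out copy above: it writes
// `size` int8 values as decimal text, one value per line.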
std::ofstream ofs;
ofs.open(name);
int8_t* data = static_cast<int8_t*>(data_in);
for (int i = 0; i < size; i++) {
float value = data[i];
ofs << value << std::endl;
}
ofs.close();
}
void saveFloatToFile(std::string name, float* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// for (int i = 0; i < size; i++) {
// float value = data_in[i];
// ofs << value << std::endl;
// }
// ofs.close();
std::ofstream ofs;
ofs.open(name);
for (int i = 0; i < size; i++) {
float value = data_in[i];
ofs << value << std::endl;
}
ofs.close();
}
void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
@@ -58,7 +58,6 @@ void set_colunm(uint32_t column) { COLUMN = column; }
int get_filter_num_alignment() { return COLUMN * 4; }
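// With the default COLUMN = 4, filters are aligned in groups of 16.
// calc_division_capacity appears to compute how many such filters fit in one
// division: FILTER_SIZE / ceil(chw / 16) * alignment, capped at FILTER_SIZE.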
int calc_division_capacity(int chw) {
// int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
int filter_num_alignment = get_filter_num_alignment();
int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
return n < FILTER_SIZE ? n : FILTER_SIZE;
@@ -222,14 +221,11 @@ int8_t* format_filter(float* data_in,
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, filter_num_alignment);
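// Illustrative example: num = 100, num_per_div_before_alignment = 64,
// filter_num_alignment = 16 -> div_num = 2, residual = 36, and
// num_after_alignment = 64 * 1 + align_to_x(36, 16) = 112.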
// saveFloatToFile("quantize_before", data_in, data_size);
int8_t* quantized_data =
reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
@@ -237,21 +233,15 @@ int8_t* format_filter(float* data_in,
float* filter_start = data_in + n * chw;
float f_max = find_max(filter_start, chw);
int8_t* quantized_start = quantized_data + n * chw;
// quantize(filter_start, quantized_start, chw, f_max);
quantize(filter_start, quantized_start, chw, max);
// filter_max.push_back(f_max);
filter_max.push_back(max);
}
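// Every filter is quantized against the shared `max`, so the per-filter
// f_max computed above is no longer used.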
// saveToFile("chw.txt", quantized_data, data_size);
int8_t* hwc_data =
reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
convert_to_hwc(quantized_data, hwc_data, num, channel, height, width);
fpga_free(quantized_data);
// saveToFile("hwc.txt", hwc_data, data_size);
int8_t* temp_data = hwc_data; // NOLINT
int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (should_align_chw(chw)) {
@@ -259,7 +249,6 @@ int8_t* format_filter(float* data_in,
fpga_malloc(num * chw_aligned * sizeof(int8_t)));
align_chw(hwc_data, hwc_aligned_data, num, chw);
// saveToFile("align_el.txt", hwc_aligned_data, data_size * 2);
temp_data = hwc_aligned_data;
fpga_free(hwc_data);
}
@@ -267,9 +256,6 @@ int8_t* format_filter(float* data_in,
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, filter_num_alignment);
// int div_num =
// (num + num_per_div_before_alignment - 1) /
// num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * chw_aligned;
int8_t* num_aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
@@ -279,19 +265,16 @@ int8_t* format_filter(float* data_in,
num,
chw_aligned);
// saveToFile("align_num.txt", num_aligned_data, data_size * 8);
fpga_free(temp_data);
temp_data = num_aligned_data;
}
int8_t* aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
reorder(temp_data, aligned_data, num_after_alignment, chw);
// saveToFile("reorder.txt", aligned_data, data_size * 8);
fpga_free(temp_data);
int8_t* interleaved_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
interleave(aligned_data, interleaved_data, num_after_alignment, chw);
// saveToFile("interleave.txt", interleaved_data, data_size * 8);
fpga_free(aligned_data);
fpga_flush(interleaved_data,
align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment *
......
@@ -45,8 +45,6 @@ int8_t* format_filter(float* data_in,
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
// void quantize_to_fp16(float** data_in, int num, int height, int width,
// float* scale_ptr);
size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr);
......
@@ -62,7 +62,6 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef ENABLE_DEBUG
// std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
@@ -250,10 +249,6 @@ int config_activation(const struct ActiveParamterArgs &args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}
// int config_power(const struct PowerArgs& args) {
// return do_ioctl(IOCTL_CONFIG_POWER, &args);
// }
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
......
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#ifndef PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H
#include <stdint.h>
#include <cstddef>
@@ -371,4 +371,4 @@ float fp16_2_fp32(int16_t fp16_num);
} // namespace zynqmp
} // namespace paddle
#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#endif // PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H
@@ -59,11 +59,9 @@ class ConvPE : public PE {
}
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) {
// use_cpu_ = true;
param_.filter->shape().height() == 1) { // NOLINT
}
if (!use_cpu_) {
// param_.filter->releaseData();
if (!use_cpu_) { // NOLINT
}
}
@@ -94,7 +92,6 @@ class ConvPE : public PE {
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
// int out_channel = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
@@ -205,7 +202,6 @@ class ConvPE : public PE {
}
out_index = h * out_width * out_channel + w * out_channel + i;
out[out_index] = sum;
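// NHWC addressing: the channel index i varies fastest, then w, then h.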
// out_index++;
}
}
}
......
@@ -172,8 +172,6 @@ inline void format_scale_bias(Tensor* scale,
}
}
// int element_num_per_div = get_filter_num_per_div(filter, group);
// int scale_bias_len = align_to_x(channel / group, 8) * group;
bias_scale::format_bias_scale_array(
&temp_data, scale_bias_len / group, scale_bias_len);
memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
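// Scale and bias are presumably packed into a single array, hence the
// 2 * scale_bias_len floats copied here.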
@@ -268,8 +266,6 @@ inline void split_filter_num(const ConvParam& c_param) {
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
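// Each group's filter count is rounded up to the hardware alignment, e.g.
// num = 100, groups = 2, alignment = 16: align_to_x(50, 16) * 2 = 128.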
// int aligned_num = align_to_x(num / param.groups ,FILTER_NUM_ALIGNMENT) *
// param.groups;
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
@@ -368,7 +364,6 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
// dilations[0] = dilations[1] ;
args.dilation = param.dilations[0];
args.output.address = out_address;
@@ -424,7 +419,6 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale,
&bias,
&conv_param->filter,
@@ -452,7 +446,6 @@ inline void split_channel(const ConvParam& c_param) {
args.image.height = conv_param->input.shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
// dilations[0] = dilations[1]
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale();
@@ -483,7 +476,6 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
if (ret == 0 && size > 1) {
// Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
for (int i = 0; i < 1; i++) {
for (int i = 0; i < img.shape().numel(); i++) {
......
@@ -62,7 +62,6 @@ class DepthwiseConvPE : public PE {
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
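// The copy now spans the full filter shape (numel) rather than a single
// channel's worth of scale values.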
......
@@ -33,7 +33,6 @@ class ElementwiseMulPE : public PE {
Tensor* output = param_.output;
int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
// int wc_aligned = / 32 * 32;
Shape s(N, {wc_aligned});
float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
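// align_to_x(n, 32) rounds n up to the next multiple of 32, e.g. 100 -> 128.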
......
@@ -38,25 +38,17 @@ struct GRUTensors {
class GRUPE : public PE {
public:
bool init() {
// Tensor* output = param_.output;
// output->setAligned(true);
// output->setDataLocation(Device);
return true;
}
bool init() { return true; }
void apply() {
auto hidden = param_.hidden;
// auto hidden_dims = hidden->dims();
int frame_size = hidden->shape().channel();
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
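// weight_ appears to hold the update/reset two-thirds of the GRU weights
// (frame_size x 2 * frame_size), zero-filled before the (elided) copy from
// param.weight noted above.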
@@ -77,52 +69,15 @@ class GRUPE : public PE {
pre_out_pe_.init();
pre_out_pe_.apply();
// // ============= C
// ElementwiseAddParam& bias_add_param = bias_ew_pe_.param();
// bias_add_param.inputs = {&pre_output_, &pre_input_};
// bias_add_param.output = &pre_input_;
// bias_ew_pe_.init();
// bias_ew_pe_.apply();
// // ====================
// Shape state_weight_shape(NC,{frame_size, frame_size});
// float* state_weight_data = state_weight_.mutableData<float>(FP32,
// state_weight_shape);
// memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
// state_weight_shape.numel() * sizeof(float));
// FullyConnectedParam& reset_out_param = reset_out_pe_.param();
// reset_out_param.input = &prev_hidden;
// reset_out_param.output = &gate_ping;
// reset_out_param.filter = &state_weight_;
// // ============== unit reset;
// update_gate_.mutableData<void>(FP16, pre_input_shape);
// InputParam& relu_param = update_relu_pe_.param();
// relu_param.input = &tempTensor;
// relu_param.output = &update_gate_;
// update_relu_pe_.init();
// update_relu_pe_.apply();
reset_gate_.mutableData<void>(FP16, hidden_shape);
prev_hidden_.mutableData<void>(FP16, hidden_shape);
reset_hidden_.mutableData<void>(FP16, hidden_shape);
// InputParam& reset_param = reset_relu_pe_.param();
// reset_param.input = &tempTensor;
// reset_param.output = &reset_gate_;
// reset_relu_pe_.init();
// reset_relu_pe_.apply();
// float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
// memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16));
// // TODO
// reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
mul_param.inputs = {&reset_gate_, &prev_hidden_};
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
// ==============
}
bool dispatch() { return true; }
@@ -136,23 +91,15 @@ class GRUPE : public PE {
int stride_hidden_prev = frame_size;
int stride_hidden = frame_size;
// Tensor* gate = value.gate;
// value.gate->saveToFile("value_input.txt");
float* update_gate_data = gate_ping_.data<float>();
float* reset_gate_data = update_gate_data + frame_size;
for (int b = 0; b < batch_size; b++) {
// memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel()
// * sizeof(float));
// tempTensor.flush();
Tensor tmp;
Shape s(NC, {1, frame_size});
float* tmp_data = tmp.mutableData<float>(FP32, s);
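// Sigmoid-activate the update gate in place below; the reset gate is
// presumably handled the same way in the elided lines, staged through `tmp`.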
for (int i = 0; i < frame_size; i++) {
// f(x) = x / (1 + abs(x))?
update_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
update_gate_data[i]);
@@ -164,17 +111,13 @@ class GRUPE : public PE {
tmp.flush();
reset_gate_.copyFrom(&tmp);
// reset_gate_.copyFrom(&tempTensor);
Tensor* hidden_prev = value.pre_output;
if (hidden_prev) {
// memcpy(prev_data, )
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
prev_hidden_.saveToFile("prev_.txt");
}
// // 4.0 reset_date * hidden_prev;
// // reset_hidden_prev[i] = reset_gate[i] * prev;
mul_pe_.dispatch();
reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
@@ -188,73 +131,13 @@ class GRUPE : public PE {
bool origin_mode,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
// int stride_update = 3 * frame_size;
// int stride_cell_state = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_hidden = frame_size;
// Tensor* hidden = value.output_value;
// float* hidden_prev = nullptr;
// if (hidden) {
// hidden_prev = hidden->data<float>();
// }
// float* cell_state = value.gate->data<float>() + 2 * frame_size;
// float* updata_gate = value.gate->data<float>();
// // float* reset_gate_data = update_gate_data + frame_size;
// float prev = 0.0f;
// for (int b = 0; b < batch_size; ++b) {
// if (origin_mode) {
// // for (int i = 0; i < frame_size; i++) {
// // float prev = 0;
// // if (hidden_prev) {
// // prev = hidden_prev[i];
// // }
// // cell_state[i] =
// lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
// // hidden[i] =
// // cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] *
// prev;
// // }
// } else {
// for (int i = 0; i < frame_size; ++i) {
// cell_state[i] =
// lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
// if (hidden_prev) {
// prev = hidden_prev[i];
// }
// float hidden_value =
// prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i];
// hidden_prev[i] = hidden_value;
// std::cout << "hidden_value::" << hidden_value << std::endl;
// }
// }
// updata_gate += stride_update;
// cell_state += stride_cell_state;
// hidden_prev += frame_size;
// }
}
int batch_size) {}
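// The removed body above was entirely commented out, so gru_unit_out_act was
// already a no-op; it is now explicitly empty.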
void copy_input(GRUTensors& value) { // NOLINT
float max = find_max(*(value.gate));
gate_ping_.mutableData<void>(FP32, value.gate->shape());
gate_ping_.copyFrom(value.gate);
// update input pointer?
// gate_.readFromFile("input/in.txt");
// // pre_input_.saveToFile("pppp_in.txt");
// gate_.scale()[0] = max / 127;
// gate_.scale()[1] = 127 / max;
// gate_.printScale("pre_input_");
// gate_.saveToFile("pre_input_.txt");
// pre_out_pe_.dispatch();
// pre_output_.saveToFile("pp_out.txt");
}
void GRUCOmpute(GRUTensors& value, // NOLINT
@@ -272,25 +155,10 @@ class GRUPE : public PE {
}
gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.pre_output) {
// // state weight;
// reset_out_pe_.dispatch();
// }
// gru_unit_out_act(active_node, origin_mode, value, frame_size,
// batch_size);
}
GRUParam& param() { return param_; }
// Tensor* preOutput() {
// return &pre_output_;
// }
// Tensor* gate() {
// return &gate_;
// }
Tensor* updateGate() { return &update_gate_; }
Tensor* resetGate() { return &reset_gate_; }
@@ -302,7 +170,6 @@ class GRUPE : public PE {
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
// =================================
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
@@ -310,7 +177,6 @@ class GRUPE : public PE {
zynqmp::Tensor reset_hidden_;
Tensor tempTensor;
// =================================
ReluPE update_relu_pe_;
ReluPE reset_relu_pe_;
......
@@ -67,9 +67,6 @@ class PoolingPE : public PE {
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 7 || k_height > 7);
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// &&
// (k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
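// Note that this assignment overrides the kernel-size heuristic above:
// every AVERAGE pooling is routed to the CPU path regardless of kernel size.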
}
@@ -79,7 +76,6 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
@@ -192,9 +188,7 @@ class PoolingPE : public PE {
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
......
@@ -67,15 +67,12 @@ class ScalePE : public PE {
Tensor* scale = dw_param.scale();
float16* scale_data = scale->mutableData<float16>(FP16, shape);
// memcpy(scale_data, param_.scale->data<float>(), input->shape().channel()
// * sizeof(float));
Tensor* bias = dw_param.bias();
float16* bias_data = bias->mutableData<float16>(FP16, shape);
std::fill_n(bias_data, input->shape().channel(), 0);
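// Bias defaults to zero for every channel and is filled from param_.bias
// below when one is provided.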
if (param_.scale->dataType() == FP32) {
// std::cout << "scale dataType FP32:" << std::endl;
if (param_.bias != nullptr) {
float* bias_data_float = param_.bias->data<float>();
for (int i = 0; i < repeat; i++) {
@@ -127,11 +124,6 @@ class ScalePE : public PE {
}
}
// if (param_.bias != nullptr) {
// memcpy(bias_data, param_.bias->data<float>(), input->shape().channel()
// * sizeof(float));
// }
dw_param.input = param_.input;
dw_param.output = param_.output;
dw_param.filter = &filter;
@@ -182,9 +174,6 @@ class ScalePE : public PE {
}
bool dispatch() {
// cpu_compute();
// return true;
if (param_.scale->dataType() == FP16) {
DepthwiseConvParam& dw_param = dw_pe_.param();
memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
@@ -194,7 +183,6 @@
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
// apply();
}
param_.input->syncToDevice();
return dw_pe_.dispatch();
......
@@ -348,19 +348,9 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
void printScale(std::string type) {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
std::cout << type << " \n";
printScale();
}
void printScale(std::string type) { printScale(); }
std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" +
@@ -388,12 +378,10 @@ class Tensor {
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
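// Prefix the file name with a running counter so successive dumps of the
// same tensor do not overwrite each other.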
counter++;
std::cout << "======== saving file:" << npath << " ============\n";
save_file_with_name(npath);
}
void save_file_with_name(std::string path) {
// return;
invalidate();
std::ofstream ofs;
ofs.open(path);
......
@@ -165,9 +165,6 @@ class TensorLite {
TargetType target() const { return target_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
@@ -257,8 +254,6 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
int64_t base = numel() / dims_[0];
TensorLite dst;
// dst.buffer_ = buffer_;
// dst.zynq_tensor_ = zynq_tensor_;
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
@@ -271,7 +266,6 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
dst_dims.production() * sizeof(T));
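// The memcpy above (its start is elided in this hunk) presumably copies the
// end - begin outer slices, each `base` elements, into dst's storage.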
dst.ZynqTensor()->saveToFile("_slice", true);
// dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
return dst;
}
}
......