diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
old mode 100755
new mode 100644
index 33efaf20169dfad4035d40d3ca02ac7dc7047db3..2b9b23070616baf18f347c6b2af2d87a300d428f
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -17,7 +17,6 @@
 #include <string>
 #include <unordered_map>
 
-// #include "lite/backends/fpga/lite_tensor.h"
 #include "lite/core/tensor.h"
 
 namespace paddle {
@@ -33,9 +32,7 @@ class Debugger {
   }
 
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
-    // tensor->printScale();
-    if (op_type != "conv") {
-      // tensor->saveToFile(op_type, true);
+    if (op_type != "conv") {  // NOLINT
     }
   }
 
@@ -60,7 +57,6 @@ inline void chw_to_hwc(Tensor* t, float* dst) {
   if (t->dims().size() > 3) {
     width = t->dims()[3];
   }
-  // int width = t->dims()[3];
   const float* chw_data = t->data<float>();
   float* hwc_data = dst;
 
@@ -92,11 +88,9 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) {
     file_stream >> value;
     data[i] = value;
   }
-  // flush();
 }
 
 inline void save_float(float* data, const std::string& name, int len) {
-  // return;
   static int counter = 0;
   std::string old_string = std::to_string(counter);
   std::string new_string =
@@ -105,12 +99,8 @@ inline void save_float(float* data, const std::string& name, int len) {
   std::string file = "arm_" + new_string + name;
   counter++;
 
-  std::cout
-      << "-------------------------- saving file: --------------------------"
-      << file << std::endl;
   std::ofstream ofs;
   ofs.open(file);
-  // float* data = dst;
   for (int i = 0; i < len; i++) {
     float value = data[i];
     ofs << value << std::endl;
@@ -135,7 +125,6 @@ inline void save_tensor(lite::Tensor* t,
 inline void save_tensor(const lite::Tensor* t,
                         const std::string& name,
                         bool convert = true) {
-  // return;
   float* data = const_cast<float*>(t->data<float>());
   float* dst = new float[t->numel()];
   if (convert) {
diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp
index dcb7dbe8775ae66b909bfea04af8756c7f683d15..30250969b6fbe6e9e5ce7e9f96f963e8bee89224 100644
--- a/lite/backends/fpga/KD/llapi/filter.cpp
+++ b/lite/backends/fpga/KD/llapi/filter.cpp
@@ -28,26 +28,26 @@ static int FILTER_SIZE = 2048;
 static int COLUMN = 4;
 
 void saveToFile(std::string name, void* data_in, int size) {
-  // std::ofstream ofs;
-  // ofs.open(name);
-
-  // int8_t* data = (int8_t*)data_in;
-  // for (int i = 0; i < size; i++) {
-  //   float value = data[i];
-  //   ofs << value << std::endl;
-  // }
-  // ofs.close();
+  std::ofstream ofs;
+  ofs.open(name);
+  // Read the buffer as int8 and write `size` values to `name`, one per line.
+  int8_t* data = static_cast<int8_t*>(data_in);
+  for (int i = 0; i < size; i++) {
+    float value = data[i];  // widen so a number is streamed, not a char
+    ofs << value << std::endl;
+  }
+  ofs.close();
 }
 
 void saveFloatToFile(std::string name, float* data_in, int size) {
-  // std::ofstream ofs;
-  // ofs.open(name);
-
-  // for (int i = 0; i < size; i++) {
-  //   float value = data_in[i];
-  //   ofs << value << std::endl;
-  // }
-  // ofs.close();
+  // Writes `size` floats from data_in to file `name`, one value per line.
+  std::ofstream out(name);
+  int idx = 0;
+  while (idx < size) {
+    out << data_in[idx] << std::endl;
+    ++idx;
+  }
+  out.close();
 }
 
 void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
@@ -58,7 +58,6 @@ void set_colunm(uint32_t column) { COLUMN = column; }
 int get_filter_num_alignment() { return COLUMN * 4; }
 
 int calc_division_capacity(int chw) {
-  // int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
   int filter_num_alignment = get_filter_num_alignment();
   int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
   return n < FILTER_SIZE ? n : FILTER_SIZE;
@@ -222,14 +221,11 @@ int8_t* format_filter(float* data_in,
       align_to_x(num_per_div_before_alignment, filter_num_alignment);
   int div_num =
       (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
-  // int num_after_alignment = num_per_div_after_alignment * div_num;
   int residual = num % num_per_div_before_alignment;
   int num_after_alignment = num_per_div_after_alignment *
                                 ((residual == 0) ? div_num : (div_num - 1)) +
                             align_to_x(residual, filter_num_alignment);
 
-  // saveFloatToFile("quantize_before", data_in, data_size);
-
   int8_t* quantized_data =
       reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
 
@@ -237,21 +233,15 @@ int8_t* format_filter(float* data_in,
     float* filter_start = data_in + n * chw;
     float f_max = find_max(filter_start, chw);
     int8_t* quantized_start = quantized_data + n * chw;
-    // quantize(filter_start, quantized_start, chw, f_max);
     quantize(filter_start, quantized_start, chw, max);
-    // filter_max.push_back(f_max);
     filter_max.push_back(max);
   }
 
-  // saveToFile("chw.txt", quantized_data, data_size);
-
   int8_t* hwc_data =
       reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
   convert_to_hwc(quantized_data, hwc_data, num, channel, height, width);
   fpga_free(quantized_data);
 
-  // saveToFile("hwc.txt", hwc_data, data_size);
-
   int8_t* temp_data = hwc_data;  // NOLINT
   int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
   if (should_align_chw(chw)) {
@@ -259,7 +249,6 @@ int8_t* format_filter(float* data_in,
         fpga_malloc(num * chw_aligned * sizeof(int8_t)));
     align_chw(hwc_data, hwc_aligned_data, num, chw);
 
-    // saveToFile("align_el.txt", hwc_aligned_data, data_size * 2);
     temp_data = hwc_aligned_data;
     fpga_free(hwc_data);
   }
@@ -267,9 +256,6 @@ int8_t* format_filter(float* data_in,
     int filter_num_alignment = get_filter_num_alignment();
     int num_per_div_after_alignment =
         align_to_x(num_per_div_before_alignment, filter_num_alignment);
-    // int div_num =
-    //     (num + num_per_div_before_alignment - 1) /
-    //     num_per_div_before_alignment;
     int num_element = div_num * num_per_div_after_alignment * chw_aligned;
     int8_t* num_aligned_data =
         reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
@@ -279,19 +265,16 @@ int8_t* format_filter(float* data_in,
               num,
               chw_aligned);
 
-    // saveToFile("align_num.txt", num_aligned_data, data_size * 8);
     fpga_free(temp_data);
     temp_data = num_aligned_data;
   }
   int8_t* aligned_data =
       reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
   reorder(temp_data, aligned_data, num_after_alignment, chw);
-  // saveToFile("reorder.txt", aligned_data, data_size * 8);
   fpga_free(temp_data);
   int8_t* interleaved_data =
       reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
   interleave(aligned_data, interleaved_data, num_after_alignment, chw);
-  // saveToFile("interleave.txt", interleaved_data, data_size * 8);
   fpga_free(aligned_data);
   fpga_flush(interleaved_data,
              align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment *
diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h
index 90093fe05b30150d6a8f7cc21e9bf7b4eb736ff9..6e056ce0da0d8e731abf7dc418800a8e3d94969a 100644
--- a/lite/backends/fpga/KD/llapi/filter.h
+++ b/lite/backends/fpga/KD/llapi/filter.h
@@ -45,8 +45,6 @@ int8_t* format_filter(float* data_in,
 
 void convert_to_hwn(int16_t** data_in, int num, int height, int width);
 size_t align_element_n(int16_t** data_in, int num, int height, int width);
-// void quantize_to_fp16(float** data_in, int num, int height, int width,
-//                       float* scale_ptr);
 size_t format_dwconv_filter(
     float** data_in, int num, int height, int width, float* scale_ptr);
 
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
index 2f29e5c1b539f47f5650928e14e8180c26414860..06488469d97c077a34b3cfdb8a049c8cd61dfc93 100755
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -62,7 +62,6 @@ void reset_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
 #ifdef ENABLE_DEBUG
-// std::cout << "fpga_malloc:" << size << std::endl;
 #endif
 #ifdef PADDLE_OS_LINUX
   void *ptr = reinterpret_cast<void *>(
@@ -250,10 +249,6 @@ int config_activation(const struct ActiveParamterArgs &args) {
   return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
 }
 
-// int config_power(const struct PowerArgs& args) {
-//     return do_ioctl(IOCTL_CONFIG_POWER, &args);
-// }
-
 int config_inplace(const struct InplaceArgs &args) {
   return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
 }
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h
index e00507e3247a70caf0dd57f5ed5b20d9ebbffd77..9489c24730e52fb778ed341e0ce452b7ef86edf9 100755
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.h
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
-#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
+#ifndef PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H
+#define PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H
 
 #include <stdint.h>
 #include <cstddef>
@@ -371,4 +371,4 @@ float fp16_2_fp32(int16_t fp16_num);
 }  // namespace zynqmp
 }  // namespace paddle
 
-#endif  // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
+#endif  // PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H
diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp
old mode 100755
new mode 100644
index ca894bdc242faf58760743a98b16a40e10a7fc82..fb15eaf77822eed076ec2001bace6871e93587ff
--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -59,11 +59,9 @@ class ConvPE : public PE {
     }
 
     if (param_.filter->shape().width() == 1 &&
-        param_.filter->shape().height() == 1) {
-      // use_cpu_ = true;
+        param_.filter->shape().height() == 1) {  // NOLINT
     }
-    if (!use_cpu_) {
-      // param_.filter->releaseData();
+    if (!use_cpu_) {  // NOLINT
     }
   }
 
@@ -94,7 +92,6 @@ class ConvPE : public PE {
     int kernel_width = param_.filter->shape().width();
     int kernel_step_h = param_.strides[0];
     int kernel_step_w = param_.strides[1];
-    // int out_channel = param_.strides[1];
     int pooled_height_ = output->shape().height();
     int pooled_width_ = out_width;
     int filter_chw = image_channels * kernel_height * kernel_width;
@@ -205,7 +202,6 @@ class ConvPE : public PE {
           }
           out_index = h * out_width * out_channel + w * out_channel + i;
           out[out_index] = sum;
-          // out_index++;
         }
       }
     }
diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp
index 3db9662b62cce6ed33d059f60835dca25be5f60e..ecee45569c8df3d3e3926b2ca78cb49da8415aa4 100755
--- a/lite/backends/fpga/KD/pes/conv_process.hpp
+++ b/lite/backends/fpga/KD/pes/conv_process.hpp
@@ -172,8 +172,6 @@ inline void format_scale_bias(Tensor* scale,
     }
   }
 
-  // int element_num_per_div = get_filter_num_per_div(filter, group);
-  // int scale_bias_len = align_to_x(channel / group, 8) * group;
   bias_scale::format_bias_scale_array(
       &temp_data, scale_bias_len / group, scale_bias_len);
   memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
@@ -268,8 +266,6 @@ inline void split_filter_num(const ConvParam& c_param) {
   int filter_num_alignment = filter::get_filter_num_alignment();
   int aligned_num =
       align_to_x(num / param.groups, filter_num_alignment) * param.groups;
-  // int aligned_num = align_to_x(num / param.groups ,FILTER_NUM_ALIGNMENT) *
-  // param.groups;
   split_num = filter::calc_split_num(aligned_num, div_capacity);
 
   Shape& out_shape = out->shape();
@@ -368,7 +364,6 @@ inline void split_filter_num(const ConvParam& c_param) {
     args.image.height = input->shape().height();
     args.image.pad_width = param.paddings[1];
     args.image.pad_height = param.paddings[0];
-    // dilations[0] = dilations[1] ;
     args.dilation = param.dilations[0];
 
     args.output.address = out_address;
@@ -424,7 +419,6 @@ inline void split_channel(const ConvParam& c_param) {
     }
     scale.flush();
     bias.flush();
-    // Shape sb_shape(N, {2 * channel});
     format_scale_bias(&scale,
                       &bias,
                       &conv_param->filter,
@@ -452,7 +446,6 @@ inline void split_channel(const ConvParam& c_param) {
     args.image.height = conv_param->input.shape().height();
     args.image.pad_width = param.paddings[1];
     args.image.pad_height = param.paddings[0];
-    // dilations[0] = dilations[1]
     args.dilation = param.dilations[0];
     args.output.address = conv_param->output.mutableData<void>();
     args.output.scale_address = conv_param->output.scale();
@@ -483,7 +476,6 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
   }
   size_t size = params.size();
   if (ret == 0 && size > 1) {
-    // Tensor* output = conv_params.output;
     Tensor& img = params[0]->output;
     for (int i = 0; i < 1; i++) {
       for (int i = 0; i < img.shape().numel(); i++) {
diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
index 8b88d24918bbbecae997817e72466798c1211a18..0efca2ec2e60e8973d92f41463b0444722f2a73b 100755
--- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
@@ -62,7 +62,6 @@ class DepthwiseConvPE : public PE {
       float16* scale_data = param_.scale()->data<float16>();
       float16* filter_data = param.quantizedFilter()->mutableData<float16>(
           FP16, param.filter->shape());
-      // memcpy(filter_data, scale_data, channel * sizeof(float16));
       memcpy(filter_data,
              scale_data,
              param.filter->shape().numel() * sizeof(float16));
diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
index 15a3f5c98aed0d858bc40240286b42f4576a5069..0505e78b61e3b0130c876880894cec29c78406f2 100644
--- a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
+++ b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
@@ -33,7 +33,6 @@ class ElementwiseMulPE : public PE {
     Tensor* output = param_.output;
 
     int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
-    // int wc_aligned =  / 32 * 32;
 
     Shape s(N, {wc_aligned});
     float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp
old mode 100755
new mode 100644
index 2f1812707356c53e2ed846d68711b0687648a744..dcacab4eeef32b245d4126b72597b398a6627ba6
--- a/lite/backends/fpga/KD/pes/gru_pe.hpp
+++ b/lite/backends/fpga/KD/pes/gru_pe.hpp
@@ -38,25 +38,17 @@ struct GRUTensors {
 
 class GRUPE : public PE {
  public:
-  bool init() {
-    // Tensor* output = param_.output;
-    // output->setAligned(true);
-    // output->setDataLocation(Device);
-    return true;
-  }
+  bool init() { return true; }  // does no work; always returns true
 
   void apply() {
     auto hidden = param_.hidden;
-    // auto hidden_dims = hidden->dims();
     int frame_size = hidden->shape().channel();
 
     zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
     float16* prev_hidden_data =
         prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
-    // set previous hidden data to 0;
     memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
 
-    // copy 2/3 weight from param.weight;
     zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
     float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
     memset(weight_data, 0, weight_shape.numel() * sizeof(float));
@@ -77,52 +69,15 @@ class GRUPE : public PE {
     pre_out_pe_.init();
     pre_out_pe_.apply();
 
-    // // ============= C
-    // ElementwiseAddParam& bias_add_param = bias_ew_pe_.param();
-    // bias_add_param.inputs = {&pre_output_, &pre_input_};
-    // bias_add_param.output = &pre_input_;
-    // bias_ew_pe_.init();
-    // bias_ew_pe_.apply();
-    // // ====================
-
-    // Shape state_weight_shape(NC,{frame_size, frame_size});
-    // float* state_weight_data = state_weight_.mutableData<float>(FP32,
-    // state_weight_shape);
-    // memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
-    //   state_weight_shape.numel() * sizeof(float));
-    // FullyConnectedParam& reset_out_param = reset_out_pe_.param();
-    // reset_out_param.input = &prev_hidden;
-    // reset_out_param.output = &gate_ping;
-    // reset_out_param.filter = &state_weight_;
-
-    // // ============== unit reset;
-    // update_gate_.mutableData<void>(FP16, pre_input_shape);
-    // InputParam& relu_param = update_relu_pe_.param();
-    // relu_param.input = &tempTensor;
-    // relu_param.output = &update_gate_;
-    // update_relu_pe_.init();
-    // update_relu_pe_.apply();
-
     reset_gate_.mutableData<void>(FP16, hidden_shape);
     prev_hidden_.mutableData<void>(FP16, hidden_shape);
     reset_hidden_.mutableData<void>(FP16, hidden_shape);
-    // InputParam& reset_param = reset_relu_pe_.param();
-    // reset_param.input = &tempTensor;
-    // reset_param.output = &reset_gate_;
-    // reset_relu_pe_.init();
-    // reset_relu_pe_.apply();
-
-    // float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
-    // memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16));
-    // // TODO
-    // reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);
 
     ElementwiseMulParam& mul_param = mul_pe_.param();
     mul_param.inputs = {&reset_gate_, &prev_hidden_};
     mul_param.output = &reset_hidden_;
     mul_pe_.init();
     mul_pe_.apply();
-    // ==============
   }
 
   bool dispatch() { return true; }
@@ -136,23 +91,15 @@ class GRUPE : public PE {
     int stride_hidden_prev = frame_size;
     int stride_hidden = frame_size;
 
-    // Tensor* gate = value.gate;
-    // value.gate->saveToFile("value_input.txt");
-
     float* update_gate_data = gate_ping_.data<float>();
     float* reset_gate_data = update_gate_data + frame_size;
 
     for (int b = 0; b < batch_size; b++) {
-      // memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel()
-      // * sizeof(float));
-      // tempTensor.flush();
-
       Tensor tmp;
       Shape s(NC, {1, frame_size});
       float* tmp_data = tmp.mutableData<float>(FP32, s);
 
       for (int i = 0; i < frame_size; i++) {
-        // f(x) = x / (1 + abs(x))?
         update_gate_data[i] =
             lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
                 update_gate_data[i]);
@@ -164,17 +111,13 @@ class GRUPE : public PE {
       tmp.flush();
       reset_gate_.copyFrom(&tmp);
 
-      // reset_gate_.copyFrom(&tempTensor);
       Tensor* hidden_prev = value.pre_output;
       if (hidden_prev) {
-        // memcpy(prev_data, )
         // TODO(chonwhite): change to pre_out;
         prev_hidden_.copyFrom(value.pre_output);
         prev_hidden_.saveToFile("prev_.txt");
       }
 
-      // // 4.0 reset_date * hidden_prev;
-      // // reset_hidden_prev[i] = reset_gate[i] * prev;
       mul_pe_.dispatch();
       reset_hidden_.saveToFile("reset_hidden_.txt");
       update_gate_data += stride_update;
@@ -188,73 +131,13 @@ class GRUPE : public PE {
                         bool origin_mode,
                         GRUTensors& value,  // NOLINT
                         int frame_size,
-                        int batch_size) {
-    // int stride_update = 3 * frame_size;
-    // int stride_cell_state = 3 * frame_size;
-    // int stride_hidden_prev = frame_size;
-    // int stride_hidden = frame_size;
-
-    // Tensor* hidden = value.output_value;
-    // float* hidden_prev = nullptr;
-    // if (hidden) {
-    //   hidden_prev = hidden->data<float>();
-    // }
-
-    // float* cell_state = value.gate->data<float>() + 2 * frame_size;
-
-    // float* updata_gate = value.gate->data<float>();
-    // // float* reset_gate_data = update_gate_data + frame_size;
-
-    // float prev = 0.0f;
-    // for (int b = 0; b < batch_size; ++b) {
-    //   if (origin_mode) {
-    //     // for (int i = 0; i < frame_size; i++) {
-    //     //   float prev = 0;
-    //     //   if (hidden_prev) {
-    //     //     prev = hidden_prev[i];
-    //     //   }
-    //     //   cell_state[i] =
-    //     lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
-    //     //   hidden[i] =
-    //     //       cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] *
-    //     prev;
-    //     // }
-    //   } else {
-    //     for (int i = 0; i < frame_size; ++i) {
-    //       cell_state[i] =
-    //       lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
-    //       if (hidden_prev) {
-    //        prev = hidden_prev[i];
-    //       }
-    //       float hidden_value =
-    //         prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i];
-    //       hidden_prev[i] = hidden_value;
-    //       std::cout << "hidden_value::" << hidden_value << std::endl;
-    //     }
-    //   }
-    //   updata_gate += stride_update;
-    //   cell_state += stride_cell_state;
-    //   hidden_prev += frame_size;
-    // }
-  }
+                        int batch_size) {}
 
   void copy_input(GRUTensors& value) {  // NOLINT
     float max = find_max(*(value.gate));
     gate_ping_.mutableData<void>(FP32, value.gate->shape());
     gate_ping_.copyFrom(value.gate);
     // update input pointer?
-
-    // gate_.readFromFile("input/in.txt");
-    // // pre_input_.saveToFile("pppp_in.txt");
-    // gate_.scale()[0] = max / 127;
-    // gate_.scale()[1] = 127 / max;
-    // gate_.printScale("pre_input_");
-
-    // gate_.saveToFile("pre_input_.txt");
-
-    // pre_out_pe_.dispatch();
-
-    // pre_output_.saveToFile("pp_out.txt");
   }
 
   void GRUCOmpute(GRUTensors& value,  // NOLINT
@@ -272,25 +155,10 @@ class GRUPE : public PE {
     }
 
     gru_unit_reset_act(active_gate, value, frame_size, batch_size);
-
-    // if (value.pre_output) {
-    //   // state weight;
-    //   reset_out_pe_.dispatch();
-    // }
-    // gru_unit_out_act(active_node, origin_mode, value, frame_size,
-    // batch_size);
   }
 
   GRUParam& param() { return param_; }
 
-  // Tensor* preOutput() {
-  //   return &pre_output_;
-  // }
-
-  // Tensor* gate() {
-  //   return &gate_;
-  // }
-
   Tensor* updateGate() { return &update_gate_; }
 
   Tensor* resetGate() { return &reset_gate_; }
@@ -302,7 +170,6 @@ class GRUPE : public PE {
   zynqmp::Tensor bias_;
   zynqmp::Tensor weight_;
   zynqmp::Tensor state_weight_;
-  // =================================
   zynqmp::Tensor update_gate_;
   zynqmp::Tensor reset_gate_;
   zynqmp::Tensor cell_state_;
@@ -310,7 +177,6 @@ class GRUPE : public PE {
   zynqmp::Tensor reset_hidden_;
 
   Tensor tempTensor;
-  // =================================
 
   ReluPE update_relu_pe_;
   ReluPE reset_relu_pe_;
diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp
index 386a470975261871137429f03d7c76b43aedb94b..a8725b51a690e0e134785fcfdb3dd70edeffd441 100755
--- a/lite/backends/fpga/KD/pes/pooling_pe.hpp
+++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp
@@ -67,9 +67,6 @@ class PoolingPE : public PE {
 
     use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
                (k_width > 7 || k_height > 7);
-    // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
-    // &&
-    //            (k_width > 255 || k_height > 255);
     use_cpu_ = param_.type == AVERAGE;
   }
 
@@ -79,7 +76,6 @@ class PoolingPE : public PE {
     input->syncToCPU();
 
     Tensor float_input;
-    // Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
     float16* data_out = output->data<float16>();
@@ -192,9 +188,7 @@ class PoolingPE : public PE {
 
   bool dispatch() {
     if (use_cpu_) {
-      // cpu_compute();
       compute();
-      // exit(-1);
       return true;
     }
     param_.input->syncToDevice();
diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp
old mode 100755
new mode 100644
index 91f698ba514b949a4d22416791ed3993c1df737f..cc89ac943f90cb20062a3d6ef9a46b705193ad04
--- a/lite/backends/fpga/KD/pes/scale_pe.hpp
+++ b/lite/backends/fpga/KD/pes/scale_pe.hpp
@@ -67,15 +67,12 @@ class ScalePE : public PE {
 
     Tensor* scale = dw_param.scale();
     float16* scale_data = scale->mutableData<float16>(FP16, shape);
-    // memcpy(scale_data, param_.scale->data<float>(), input->shape().channel()
-    // * sizeof(float));
 
     Tensor* bias = dw_param.bias();
     float16* bias_data = bias->mutableData<float16>(FP16, shape);
     std::fill_n(bias_data, input->shape().channel(), 0);
 
     if (param_.scale->dataType() == FP32) {
-      // std::cout << "scale dataType FP32:" << std::endl;
       if (param_.bias != nullptr) {
         float* bias_data_float = param_.bias->data<float>();
         for (int i = 0; i < repeat; i++) {
@@ -127,11 +124,6 @@ class ScalePE : public PE {
       }
     }
 
-    // if (param_.bias != nullptr) {
-    //   memcpy(bias_data, param_.bias->data<float>(), input->shape().channel()
-    //   * sizeof(float));
-    // }
-
     dw_param.input = param_.input;
     dw_param.output = param_.output;
     dw_param.filter = &filter;
@@ -182,9 +174,6 @@ class ScalePE : public PE {
   }
 
   bool dispatch() {
-    // cpu_compute();
-    // return true;
-
     if (param_.scale->dataType() == FP16) {
       DepthwiseConvParam& dw_param = dw_pe_.param();
       memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
@@ -194,7 +183,6 @@ class ScalePE : public PE {
       dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
 
       dw_param.quantizedFilter()->flush();
-      // apply();
     }
     param_.input->syncToDevice();
     return dw_pe_.dispatch();
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
old mode 100755
new mode 100644
index 047498eed009dded5ce398ddabc2079b62d937df..f1b07d02622fad32e99205667424a4cb3c9fb46d
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -348,19 +348,9 @@ class Tensor {
     if (placeHolder_ == nullptr) {
       return;
     }
-    std::cout << scale()[0] << " , " << scale()[1] << std::endl;
   }
 
-  void printScale(std::string type) {
-    std::cout << type << " : "
-              << std::to_string(shape_->num()) + "_" +
-                     std::to_string(shape_->channel()) + "_" +
-                     std::to_string(shape_->height()) + "_" +
-                     std::to_string(shape_->width())
-              << std::endl;
-    std::cout << type << " \n";
-    printScale();
-  }
+  void printScale(std::string type) { printScale(); }  // type is unused
 
   std::string dimsFileName() {
     return std::to_string(shape_->num()) + "_" +
@@ -388,12 +378,10 @@ class Tensor {
     static int counter = 0;
     std::string npath = std::to_string(counter) + "_" + path;
     counter++;
-    std::cout << "======== saving file:" << npath << " ============\n";
     save_file_with_name(npath);
   }
 
   void save_file_with_name(std::string path) {
-    // return;
     invalidate();
     std::ofstream ofs;
     ofs.open(path);
diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h
index ccf3628ecf16c91b722380ad6bfd11b8e89b1879..311fc8a98400e5a6916ba1b9c8de1e6e0bcec4c0 100644
--- a/lite/backends/fpga/lite_tensor.h
+++ b/lite/backends/fpga/lite_tensor.h
@@ -165,9 +165,6 @@ class TensorLite {
 
   TargetType target() const { return target_; }
 
-  // template <typename T>
-  // TensorLite Slice(int64_t begin, int64_t end) const;
-
   zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
 
   friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
@@ -257,8 +254,6 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
     int64_t base = numel() / dims_[0];
 
     TensorLite dst;
-    // dst.buffer_ = buffer_;
-    // dst.zynq_tensor_ = zynq_tensor_;
     dst.target_ = target_;
     auto dst_dims = dims_;
     dst_dims[0] = end - begin;
@@ -271,7 +266,6 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
            dst_dims.production() * sizeof(T));
     dst.ZynqTensor()->saveToFile("_slice", true);
 
-    // dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
     return dst;
   }
 }