mobilenet 1&2 works

46f36ceb · chonwhite · d3d793c7 · 46f36ceb · 46f36ceb · 46f36ceb
15 changed files
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -19,12 +19,13 @@
 #include <string>
 #include <unordered_map>

+#include "lite/core/program.h"
 #include "lite/core/tensor.h"

 namespace paddle {
 namespace lite {

-// #define FPGA_PRINT_TENSOR
+#define FPGA_PRINT_TENSOR

 class Debugger {
 public:
@@ -35,7 +36,7 @@ class Debugger {

  void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
    if (op_config[op_type]) {
-      tensor->saveToFile(op_type, true);
+      // tensor->saveToFile(op_type, true);
    }
  }


--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -72,18 +72,110 @@ class ConvPE : public PE {
    }
    if (param_.filter->shape().width() == 1 &&
        param_.filter->shape().num() % 16 != 0) {
-      use_cpu_ = true;
+      // use_cpu_ = true;
    }
    if (!use_cpu_) {
      // param_.filter->releaseData();
    }
+  }
+
+  void cpu_conv_half_hwc() {
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+
+    Shape& input_shape = input->shape();
+    Shape& out_shape = output->shape();
+
+    int image_height = input_shape.height();
+    int image_width = input_shape.width();
+    int image_channels = input_shape.channel();
+    int image_pad_h = param_.paddings[0];
+    int image_pad_w = param_.paddings[0];
+    int kernel_height = param_.filter->shape().height();
+    int kernel_width = param_.filter->shape().width();
+    int kernel_step_h = param_.strides[0];
+    int kernel_step_w = param_.strides[1];
+    int dilation_rate = 1;
+    int out_channel = out_shape.channel();
+    int pooled_height_ = out_shape.height();
+    int pooled_width_ = out_shape.width();
+    int filter_chw = image_channels * kernel_height * kernel_width;
+
+    int kernel_rw = kernel_width + (dilation_rate - 1) * (kernel_width - 1);
+    int kernel_rh = kernel_height + (dilation_rate - 1) * (kernel_height - 1);
+
+    float* weight = param_.filter->data<float>();
+
+    Tensor float_input;
+    Tensor float_output;
+    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
+    float_input.copyFrom(input);
+
+    float* out = float_output.mutableData<float>(FP32, output->shape());
+
+    for (int ph = 0; ph < pooled_height_; ph++) {
+      for (int pw = 0; pw < pooled_width_; pw++) {
+        int hstart = ph * kernel_step_h - image_pad_h;
+        int wstart = pw * kernel_step_w - image_pad_w;
+        int hend = std::min(hstart + kernel_rh, (int)image_height);
+        int wend = std::min(wstart + kernel_rw, (int)image_width);
+
+        int hstart_plus =
+            dilation_rate * ceil(float(image_pad_h - ph * kernel_step_h) /
+                                 float(dilation_rate)) -
+            image_pad_h + ph * kernel_step_h;
+        int wstart_plus =
+            dilation_rate * ceil(float(image_pad_w - pw * kernel_step_w) /
+                                 float(dilation_rate)) -
+            image_pad_w + pw * kernel_step_w;
+
+        int hstart_ = hstart < 0 ? hstart_plus : hstart;
+        int wstart_ = wstart < 0 ? wstart_plus : wstart;
+
+        for (int oc = 0; oc < out_channel; oc++) {
+          float sum = 0.0f;
+          const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
+          for (int c = 0; c < image_channels; c++) {
+            for (int h = hstart_; h < hend; h += dilation_rate) {
+              int hi = 0;
+              if (hstart < 0) {
+                hi = (kernel_rh - (hend - h)) / dilation_rate;
+              } else {
+                hi = (h - hstart_) / dilation_rate;
+              }

-    // exit(-1);
+              for (int w = wstart_; w < wend; w += dilation_rate) {
+                int wi = 0;
+                if (wstart < 0) {
+                  wi = (kernel_rw - (wend - w)) / dilation_rate;
+                } else {
+                  wi = (w - wstart_) / dilation_rate;
                }
+
+                const int index = (h * image_width + w) * image_channels + c;
+                int weight_index = oc * filter_chw +
+                                   kernel_width * kernel_height * c +
+                                   kernel_width * hi + wi;
+                float value = image_addr[index] * weight[weight_index];
+                sum += value;
+              }
+            }
+          }
+          float s = param_.scale()->data<float>()[oc];
+          float b = param_.bias()->data<float>()[oc];
+          out[pool_index] = sum * s + b;
+        }
+      }
+    }
+    float_output.saveToFile("fo", true);
+    exit(-1);
+  }
+
  void cpu_compute() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
-    input->syncToCPU();
+    // input->saveToFile("input", true);
+    // input->syncToCPU();

    Tensor float_input;
    Tensor float_output;
@@ -117,24 +209,39 @@ class ConvPE : public PE {
        for (int j = 0; j < in_channel; j++) {
          sum += mi[j];
        }
-        sum *= param_.scale()->data<float>()[i];
-        sum += param_.bias()->data<float>()[i];
-        out[i * wh + k] = sum;
-        max = std::max(max, std::abs(sum));
+        float fv = sum;
+        float s = param_.scale()->data<float>()[i];
+        float b = param_.bias()->data<float>()[i];
+
+        fv *= s;
+        fv += b;
+
+        // std::cout << "\n" << fv << " = " << sum << " x " << s << " + " << b
+        // << std::endl;
+
+        out[i * wh + k] = fv;
+        max = std::max(max, std::abs(fv));
      }
    }
    delete[] mi;
+    param_.bias()->saveToFile("bias", true);
+
+    exit(-1);
+
    float_output.flush();
+    float_output.saveToFile("float_output", true);
    output->copyFrom(&float_output);
+    output->invalidate();
    output->scale()[0] = max / 127.0;
    output->scale()[1] = 127.0 / max;
    // output->saveToFile("cpu", true);
  }

  bool dispatch() {
-    fpga_reset();
+    // fpga_reset();
    if (use_cpu_) {
-      cpu_compute();
+      // cpu_compute();
+      cpu_conv_half_hwc();
      return true;
    }


--- a/lite/backends/fpga/KD/pes/softmax_pe.cpp
+++ b/lite/backends/fpga/KD/pes/softmax_pe.cpp
@@ -59,6 +59,7 @@ static void softmax(Tensor *X, Tensor *Y) {
  int batch_size = X->shape().num();
  int num_classes = dims[X->shape().dimSize() - 1];
  int channels = X->shape().numel() / batch_size / num_classes;
+
  float *x = X->data<float>();
  float *y = Y->mutableData<float>();

@@ -140,12 +141,23 @@ bool SoftmaxPE::init() {
 bool SoftmaxPE::dispatch() {
  Tensor *input = param_.input;
  Tensor *output = param_.output;
-  input->syncToCPU();

  Tensor float_input;
  Tensor float_output;
  float_input.mutableData<float>(DataType::FP32, input->shape());
-  float_input.copyFrom(input);
+  // input->saveToFile("in", true);
+  // input->syncToDevice();
+  // float_input.copyFrom(input);
+
+  input->syncToCPU();
+  float16 *in_data = input->data<float16>();
+  float *f_data = float_input.data<float>();
+  for (int i = 0; i < input->shape().channel(); i++) {
+    f_data[i] = half_to_float(in_data[i]);
+  }
+
+  // float_input.invalidate();
+  // float_input.saveToFile("fin", true);

  float *out_data =
      float_output.mutableData<float>(DataType::FP32, input->shape());

--- a/lite/backends/fpga/KD/pes/yolobox_pe.hpp
+++ b/lite/backends/fpga/KD/pes/yolobox_pe.hpp
@@ -20,30 +20,40 @@ limitations under the License. */
 namespace paddle {
 namespace zynqmp {

-
-float sigmoid(float x) {
-  return 1.0 / (1.0 + std::exp(-x));
-}
-
-inline void GetYoloBox(float* box, const float* x, const int* anchors, int w,
-                                  int h, int an_idx, int grid_size,
-                                  int input_size, int index, 
-                                  int img_height, int img_width) {
-  box[0] = (w + sigmoid(x[index])) * img_width * 1.0f/ grid_size;
+float sigmoid(float x) { return 1.0 / (1.0 + std::exp(-x)); }
+
+inline void GetYoloBox(float* box,
+                       const float* x,
+                       const int* anchors,
+                       int w,
+                       int h,
+                       int an_idx,
+                       int grid_size,
+                       int input_size,
+                       int index,
+                       int img_height,
+                       int img_width) {
+  box[0] = (w + sigmoid(x[index])) * img_width * 1.0f / grid_size;
  box[1] = (h + sigmoid(x[index + 1])) * img_height * 1.0f / grid_size;
-  box[2] = std::exp(x[index + 2 ]) * anchors[2 * an_idx] * img_width * 1.0f/
+  box[2] = std::exp(x[index + 2]) * anchors[2 * an_idx] * img_width * 1.0f /
           input_size;
-  box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] *
-           img_height * 1.0f / input_size;
+  box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] * img_height *
+           1.0f / input_size;
 }

-inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
-                                    int an_num, int an_stride, int stride,
+inline int GetEntryIndex(int batch,
+                         int an_idx,
+                         int hw_idx,
+                         int an_num,
+                         int an_stride,
+                         int stride,
                         int entry) {
  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
 }

-inline void CalcDetectionBox(float* boxes, float* box, const int box_idx,
+inline void CalcDetectionBox(float* boxes,
+                             float* box,
+                             const int box_idx,
                             const int img_height,
                             const int img_width) {
  boxes[box_idx] = box[0] - box[2] / 2;
@@ -52,19 +62,19 @@ inline void CalcDetectionBox(float* boxes, float* box, const int box_idx,
  boxes[box_idx + 3] = box[1] + box[3] / 2;

  boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : 0;
-  boxes[box_idx + 1] =
-      boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : 0;
-  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
-                           ? boxes[box_idx + 2]
-                           : (img_width - 1);
-  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
-                           ? boxes[box_idx + 3]
+  boxes[box_idx + 1] = boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : 0;
+  boxes[box_idx + 2] =
+      boxes[box_idx + 2] < img_width - 1 ? boxes[box_idx + 2] : (img_width - 1);
+  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 ? boxes[box_idx + 3]
                                                           : (img_height - 1);
 }

-inline void CalcLabelScore(float* scores, const float* input,
-                                      const int label_idx, const int score_idx,
-                                      const int class_num, const float conf) {
+inline void CalcLabelScore(float* scores,
+                           const float* input,
+                           const int label_idx,
+                           const int score_idx,
+                           const int class_num,
+                           const float conf) {
  for (int i = 0; i < class_num; i++) {
    scores[score_idx + i] = conf * sigmoid(input[label_idx + i]);
    // std::cout << scores[score_idx + i] << " ";
@@ -72,7 +82,6 @@ inline void CalcLabelScore(float* scores, const float* input,
  // std::cout << std::endl;
 }

-
 class YoloBoxPE : public PE {
 public:
  bool init() {
@@ -93,7 +102,6 @@ class YoloBoxPE : public PE {
    float conf_thresh = param_.confThresh;
    int downsample_ratio = param_.downsampleRatio;

-
    const int num = input->shape().num();
    const int height = input->shape().height();
    const int width = input->shape().width();
@@ -139,11 +147,13 @@ class YoloBoxPE : public PE {
    Tensor scores_float;

    boxes_float.setDataLocation(CPU);
-    float* boxes_float_data = boxes_float.mutableData<float>(FP32, boxes->shape());
+    float* boxes_float_data =
+        boxes_float.mutableData<float>(FP32, boxes->shape());
    memset(boxes_float_data, 0, boxes->shape().numel() * sizeof(float));

    scores_float.setDataLocation(CPU);
-    float* scores_float_data = scores_float.mutableData<float>(FP32, scores->shape());
+    float* scores_float_data =
+        scores_float.mutableData<float>(FP32, scores->shape());
    memset(scores_float_data, 0, scores->shape().numel() * sizeof(float));

    // float* boxes_data = boxes->mutableData<float>();
@@ -158,15 +168,16 @@ class YoloBoxPE : public PE {
    // int img_width = imgsize_data[2 * i + 1];
    int img_height = imgsize_data[0];
    int img_width = imgsize_data[1];
-    std::cout << "YoloBoxPE imgsize:" << img_height << "," << img_width << std::endl;
+    std::cout << "YoloBoxPE imgsize:" << img_height << "," << img_width
+              << std::endl;

    int channel = input_float.shape().channel();
    int count = 0;
    for (int h = 0; h < height; h++) {
-      for (int w = 0; w < width ; w++) {
+      for (int w = 0; w < width; w++) {
        for (int n = 0; n < an_num; n++) {
-     
-          int obj_idx = channel * width * h + channel * w + n * (5 + class_num) + 4;
+          int obj_idx =
+              channel * width * h + channel * w + n * (5 + class_num) + 4;
          // std::cout << obj_idx << " ";
          float conf = sigmoid(input_data[obj_idx]);
          if (conf < conf_thresh) {
@@ -174,16 +185,34 @@ class YoloBoxPE : public PE {
            continue;
          }

-          int box_idx = channel * width * h + channel * w + n * (5 + class_num) + 0;
-          GetYoloBox(box, input_data, anchors_data, w, h, n, height, input_size,
-                        box_idx, img_height, img_width);
+          int box_idx =
+              channel * width * h + channel * w + n * (5 + class_num) + 0;
+          GetYoloBox(box,
+                     input_data,
+                     anchors_data,
+                     w,
+                     h,
+                     n,
+                     height,
+                     input_size,
+                     box_idx,
+                     img_height,
+                     img_width);

          box_idx = h * an_num * 4 * width + an_num * 4 * w + n * 4;
-          CalcDetectionBox(boxes_float_data, box, box_idx, img_height,img_width);
-
-          int label_idx = channel * width * h + channel * w + n * (5 + class_num) + 5;
-          int score_idx = h * an_num * class_num * width + an_num * class_num * w + n * class_num;
-          CalcLabelScore(scores_float_data, input_data, label_idx, score_idx, class_num, conf);
+          CalcDetectionBox(
+              boxes_float_data, box, box_idx, img_height, img_width);
+
+          int label_idx =
+              channel * width * h + channel * w + n * (5 + class_num) + 5;
+          int score_idx = h * an_num * class_num * width +
+                          an_num * class_num * w + n * class_num;
+          CalcLabelScore(scores_float_data,
+                         input_data,
+                         label_idx,
+                         score_idx,
+                         class_num,
+                         conf);
        }
      }
    }
@@ -199,7 +228,6 @@ class YoloBoxPE : public PE {

 private:
  YoloBoxParam param_;
-
 };
 }  // namespace zynqmp
 }  // namespace paddle
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -70,6 +70,7 @@ class PlaceHolder {
  explicit PlaceHolder(size_t size) {
    size_ = size;
    data_ = fpga_malloc(size_);
+    // memset(data_, 0, size);
  }

  void* data() { return data_; }
@@ -80,7 +81,7 @@ class PlaceHolder {

  ~PlaceHolder() { fpga_free(data_); }

-  float scale_[2];
+  float scale_[2] = {0};

 private:
  void* data_ = nullptr;
@@ -409,12 +410,14 @@ class Tensor {
      if (i < 10) {
        std::cout << value << ",";
      }
+
      //   if (i > 1000) {
      //       break;
      //   }
      ofs << value << std::endl;
    }
-    usleep(30000);
+    std::cout << std::endl;
+    // usleep(30000);
    ofs.close();
  }


--- a/lite/backends/fpga/monitor.hpp
+++ b/lite/backends/fpga/monitor.hpp
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+
+#include "lite/core/program.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+
+class Monitor {
+ public:
+  static Monitor& get_instance() {
+    static Monitor s_instance;
+    return s_instance;
+  }
+
+  void inferStart() {}
+
+  void preRun(Instruction& inst) {
+    VLOG(4)  << "Running op:" << const_cast<OpLite*>(inst.op())->Type();
+  }
+
+  void postRun(Instruction& inst) {}
+
+  void inferEnd() {}
+
+ private:
+};
+
+}  // namespace lite
+}  // namespace paddle
--- a/lite/core/mir/type_precision_cast_pass.cc
+++ b/lite/core/mir/type_precision_cast_pass.cc
@@ -134,7 +134,6 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // Start from inputs of the graph, those should have place set.
  std::list<Node*> nodes;
  for (auto& node : graph->StmtTopologicalOrder()) {
-
    // if (node->IsStmt()) {
    //     auto& s = node->AsStmt();
    //     std::cout << "type_precision type:" << s.op_type() << std::endl;

--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -25,6 +25,10 @@
 #include "lite/core/profile/precision_profiler.h"
 #endif

+#ifdef LITE_WITH_FPGA
+#include "lite/backends/fpga/monitor.hpp"
+#endif
+
 namespace paddle {
 namespace lite {

@@ -151,23 +155,41 @@ void RuntimeProgram::Run() {
      inst_precision_profiler.GetSummaryHeader();
 #endif

+#ifdef LITE_WITH_FPGA
+  Monitor& monitor = Monitor::get_instance();
+  monitor.inferStart();
+#endif
+
  for (auto& inst : instructions_) {
+#ifdef LITE_WITH_FPGA
+    monitor.preRun(inst);
+#endif
+
 #ifndef LITE_WITH_FPGA
    if (inst.is_feed_fetch_op()) continue;
 #endif
+
 #ifdef LITE_WITH_CUDA
    if (inst.need_sync()) {
      inst.Sync();
    }
 #endif
    inst.Run();
+
+#ifdef LITE_WITH_FPGA
+    monitor.postRun(inst);
+#endif
+
 #ifdef LITE_WITH_PRECISION_PROFILE
-#ifndef LITE_WITH_FPGA
    precision_profiler_summary +=
        inst_precision_profiler.GetInstPrecision(&inst);
-#endif
 #endif  // LITE_WITH_PRECISION_PROFILE
  }
+
+#ifdef LITE_WITH_FPGA
+  monitor.inferEnd();
+#endif
+
 #ifdef LITE_WITH_PROFILE
  LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1);
 #endif

--- a/lite/kernels/fpga/conv_compute.cc
+++ b/lite/kernels/fpga/conv_compute.cc
@@ -25,12 +25,46 @@ namespace kernels {
 namespace fpga {

 using float16 = zynqmp::float16;
+using lite_api::ActivationType;

 void ConvCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  param.output->mutable_data<float16>();
  int pad_h = (*param.paddings)[0];
  int pad_w = (*param.paddings)[2];
+
+  zynqmp::ActiveType active_type = zynqmp::TYPE_NONE;
+  float leaky_relu_factor = 0;
+
+  switch (param.activation_param.active_type) {
+    case ActivationType::kIndentity:
+      active_type = zynqmp::TYPE_NONE;
+      break;
+    case ActivationType::kRelu:
+      active_type = zynqmp::TYPE_RELU;
+      break;
+    case ActivationType::kRelu6:
+      active_type = zynqmp::TYPE_RELU6;
+      break;
+    case ActivationType::kPRelu:
+    case ActivationType::kLeakyRelu:
+      active_type = zynqmp::TYPE_LEAKY_RELU;
+      leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
+      break;
+    case ActivationType::kSigmoid:
+      active_type = zynqmp::TYPE_SIGMOID;
+      break;
+    case ActivationType::kTanh:
+    case ActivationType::kSwish:
+    case ActivationType::kExp:
+    case ActivationType::kAbs:
+    case ActivationType::kHardSwish:
+    case ActivationType::kReciprocal:
+    default:
+      throw("not supported activation");
+      break;
+  }
+
  // ====================================================
  if (param.x->ZynqTensor()->shape().channel() != 1 &&
      param.groups == param.x->ZynqTensor()->shape().channel()) {
@@ -45,17 +79,12 @@ void ConvCompute::PrepareForRun() {
    conv_param.paddings = std::vector<int>({pad_h, pad_w});
    conv_param.dilations = *param.dilations;
    fill_scale_bias_const(&conv_param);
+    if (param.bias != nullptr) {
      conv_param.bias()->copyFrom(param.bias->ZynqTensor());
-
-    if (param.fuse_relu) {
-      conv_param.activeParam.type = zynqmp::TYPE_RELU;
    }

-    if (param.activation_param.Leaky_relu_alpha > 0.001) {
-      conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
-      conv_param.activeParam.leaky_relu_factor =
-          param.activation_param.Leaky_relu_alpha;
-    }
+    conv_param.activeParam.type = active_type;
+    conv_param.activeParam.leaky_relu_factor = leaky_relu_factor;

    dw_conv_pe_.init();
    dw_conv_pe_.apply();
@@ -74,21 +103,12 @@ void ConvCompute::PrepareForRun() {
      conv_param.bias()->copyFrom(param.bias->ZynqTensor());
    }

-    if (param.fuse_relu) {
-      conv_param.activeParam.type = zynqmp::TYPE_RELU;
-    }
-
-    if (param.activation_param.Leaky_relu_alpha > 0.001) {
-      conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
-      conv_param.activeParam.leaky_relu_factor =
-          param.activation_param.Leaky_relu_alpha;
-    }
+    conv_param.activeParam.type = active_type;
+    conv_param.activeParam.leaky_relu_factor = leaky_relu_factor;

    conv_pe_.init();
    conv_pe_.apply();
  }
-  // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha
-  // << std::endl;
 }

 void ConvCompute::Run() {

--- a/lite/kernels/fpga/multiclass_nms_compute.cc
+++ b/lite/kernels/fpga/multiclass_nms_compute.cc
@@ -227,7 +227,7 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
      SliceOneClass<T>(scores, c, &score_slice);
      SliceOneClass<T>(bboxes, c, &bbox_slice);
    }
-    NMSFast(bboxes,// TODO
+    NMSFast(bboxes,  // TODO
            score_slice,
            score_threshold,
            nms_threshold,

--- a/lite/kernels/fpga/reshape_compute.cc
+++ b/lite/kernels/fpga/reshape_compute.cc
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/kernels/fpga/reshape_compute.h"
 #include <vector>
+
+#include "lite/backends/fpga/KD/debugger.hpp"
+#include "lite/kernels/fpga/reshape_compute.h"
 #include "lite/operators/reshape_op.h"

 namespace paddle {
@@ -48,21 +50,31 @@ void FlattenCompute::Run() {
 #endif
 }

-void ReshapeCompute::Run() {
+void ReshapeCompute::PrepareForRun() {
  auto& param = Param<operators::ReshapeParam>();
  auto x = param.x;
  auto output = param.output;
  auto output_dims = output->dims();

-  x->ZynqTensor()->unalignImage();
-
-  // x->ZynqTensor()->saveToFile("ri", true);
-
  output->Resize(output_dims);
  output->mutable_data<float16>();
+}
+
+void ReshapeCompute::Run() {
+  auto& param = Param<operators::ReshapeParam>();
+  auto x = param.x;
+  auto output = param.output;
+  // auto output_dims = output->dims();
+
+  // x->ZynqTensor()->invalidate();// TODO
+  x->ZynqTensor()->unalignImage();
+  x->ZynqTensor()->flush();
+
+  // output->Resize(output_dims);
+  // output->mutable_data<float16>();

  if (param.inplace) {
-    output->ShareDataWith(*x);
+    // output->ShareDataWith(*x);
  } else {
    // output->CopyDataFrom(*x);
  }
@@ -70,7 +82,7 @@ void ReshapeCompute::Run() {
  output->ZynqTensor()->copyFrom(x->ZynqTensor());
  // output->ZynqTensor()->saveToFile("ro", true);
  output->ZynqTensor()->flush();
-  output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
+// output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());

 #ifdef FPGA_PRINT_TENSOR
  Debugger::get_instance().registerOutput("reshape", output->ZynqTensor());

--- a/lite/kernels/fpga/reshape_compute.h
+++ b/lite/kernels/fpga/reshape_compute.h
@@ -25,6 +25,7 @@ namespace fpga {
 class ReshapeCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
+  void PrepareForRun() override;
  void Run() override;

  virtual ~ReshapeCompute() = default;
@@ -41,6 +42,7 @@ class FlattenCompute
 class ReshapeComputeFpgaToHost
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
+  void PrepareForRun() override;
  void Run() override;

  virtual ~ReshapeComputeFpgaToHost() = default;

--- a/lite/kernels/fpga/softmax_compute.cc
+++ b/lite/kernels/fpga/softmax_compute.cc
@@ -14,6 +14,7 @@

 #include "lite/kernels/fpga/softmax_compute.h"
 #include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -36,11 +37,10 @@ void SoftmaxCompute::PrepareForRun() {

 void SoftmaxCompute::Run() {
  zynqmp::SoftmaxParam& softmax_param = pe_.param();
-  // softmax_param.input->saveToFile("softmax_in", true);
  pe_.dispatch();

-  softmax_param.output->flush();
-// softmax_param.output->saveToFile("softmax", true);
+//   softmax_param.output->flush();
+// // softmax_param.output->saveToFile("softmax", true);
 #ifdef FPGA_PRINT_TENSOR
  Debugger::get_instance().registerOutput("softmax", softmax_param.output);
 #endif

--- a/lite/kernels/fpga/yolo_box_compute.cc
+++ b/lite/kernels/fpga/yolo_box_compute.cc
@@ -29,7 +29,6 @@ void YoloBoxCompute::PrepareForRun() {
  lite::Tensor* Boxes = param.Boxes;
  lite::Tensor* Scores = param.Scores;

-
  Boxes->mutable_data<float>();
  Scores->mutable_data<float>();

@@ -45,16 +44,14 @@ void YoloBoxCompute::PrepareForRun() {

  pe_.init();
  pe_.apply();
-
 }

 void YoloBoxCompute::Run() {
-
  pe_.dispatch();

  zynqmp::YoloBoxParam& yolobox_param = pe_.param();
  yolobox_param.imgSize->saveToFile("img_size", true);
-//   exit(-1);
+  //   exit(-1);
  yolobox_param.outputBoxes->saveToFile("yolo_boxes", true);
  yolobox_param.outputScores->saveToFile("yolo_scores", true);
 }

--- a/lite/kernels/fpga/yolo_box_compute.h
+++ b/lite/kernels/fpga/yolo_box_compute.h
@@ -33,7 +33,7 @@ class YoloBoxCompute
  void PrepareForRun() override;
  void Run() override;

-  virtual ~YoloBoxCompute() {
+  virtual ~YoloBoxCompute(){

  };