diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index c0510e0f381a2ba6ae355870752dcb7dae1bd93f..fa069688b7cfa80adc299fa9668bd2b045897292 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -48,26 +48,33 @@ class Debugger { void tock(std::string key) {} + void setEnable(bool en) { enabled_ = en; } + private: + bool enabled_ = false; + std::unordered_map op_config; std::unordered_map tick_tock_map; Debugger() { - op_config["concat"] = true; - op_config["pooling"] = true; - op_config["conv"] = true; - op_config["dwconv"] = true; - op_config["ew_add"] = true; - op_config["crop"] = true; - op_config["feed"] = true; - op_config["mul"] = true; - op_config["fetch"] = true; - op_config["boxes"] = true; - op_config["scores"] = true; - op_config["nms"] = true; - op_config["pb_boxes"] = true; - op_config["pb_variances"] = true; + // op_config["concat"] = true; + // op_config["pooling"] = true; + // op_config["conv"] = true; + // op_config["dropout"] = true; + // op_config["dwconv"] = true; + // op_config["ew_add"] = true; + // op_config["ew_mul"] = true; + // op_config["crop"] = true; + // op_config["feed"] = true; // op_config["fc"] = true; - op_config["softmax"] = true; + // op_config["mul"] = true; + // op_config["fetch"] = true; + // op_config["boxes"] = true; + // op_config["scores"] = true; + // op_config["nms"] = true; + // op_config["pb_boxes"] = true; + // op_config["pb_variances"] = true; + + // op_config["softmax"] = true; } }; diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp index 7727345b1c138ba7d84bcbcd078badb2e2fb98d5..e09b9d67d1263278abcd84d6ab9d7e392ee94b48 100755 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ b/lite/backends/fpga/KD/llapi/filter.cpp @@ -240,6 +240,8 @@ int8_t* format_filter(float* data_in, for (int n = 0; n < num; n++) { float* filter_start = data_in + n * chw; int8_t* quantized_start = quantized_data + n * chw; + // 
float f_max = find_max(filter_start, chw); + float f_max = max; quantize(filter_start, quantized_start, chw, f_max); filter_max.push_back(f_max); } diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp index 42ec32957e5884aaae3cc96f46060de114b44ead..222a788d351d9b3dd2cde7c595af898602990ea3 100644 --- a/lite/backends/fpga/KD/pe_params.hpp +++ b/lite/backends/fpga/KD/pe_params.hpp @@ -83,26 +83,34 @@ struct ConvParam : PEParam { std::vector kernelSize; std::vector dilations; - Tensor* scale() { return scale_; } + Tensor* scale() { return &scale_; } - Tensor* bias() { return bias_; } + Tensor* bias() { return &bias_; } std::vector& splitParams() { return splitParams_; } + ~ConvParam() { + for (int i = 0; i < splitParams_.size(); i++) { + BasicConvParam* basic_param = splitParams_[i]; + delete basic_param; + } + splitParams_.clear(); + } + protected: std::vector splitParams_; - Tensor* scale_ = new Tensor(); - Tensor* bias_ = new Tensor(); + Tensor scale_; + Tensor bias_; }; struct DepthwiseConvParam : ConvParam { public: - Tensor* quantizedFilter() { return quantizedFilter_; } + Tensor* quantizedFilter() { return &quantizedFilter_; } DWconvArgs args; protected: - Tensor* quantizedFilter_ = new Tensor(); + Tensor quantizedFilter_; }; enum PoolingType : int { @@ -142,7 +150,7 @@ struct ElementwiseAddParam : PEParam { struct ElementwiseMulParam : PEParam { public: - Tensor* input_x; + Tensor* input_x = nullptr; Tensor* input_y = nullptr; Tensor* output = nullptr; }; @@ -154,13 +162,13 @@ struct FullyConnectedParam : PEParam { Tensor* bias = nullptr; Tensor* output = nullptr; - Tensor* quantizedFilter() { return quantizedFilter_; } + Tensor* quantizedFilter() { return &quantizedFilter_; } - Tensor* biasScale() { return biasScale_; } + Tensor* biasScale() { return &biasScale_; } protected: - Tensor* quantizedFilter_ = new Tensor(); - Tensor* biasScale_ = new Tensor(); + Tensor quantizedFilter_; + Tensor biasScale_; }; struct SoftmaxParam : 
PEParam { @@ -193,10 +201,10 @@ struct NormParam : PEParam { }; struct PriorBoxParam : PEParam { - Tensor* input; - Tensor* image; - Tensor* outputBoxes; - Tensor* outputVariances; + Tensor* input = nullptr; + Tensor* image = nullptr; + Tensor* outputBoxes = nullptr; + Tensor* outputVariances = nullptr; std::vector minSizes; std::vector maxSizes; @@ -212,10 +220,10 @@ struct PriorBoxParam : PEParam { }; struct YoloBoxParam : PEParam { - Tensor* input; - Tensor* imgSize; - Tensor* outputBoxes; - Tensor* outputScores; + Tensor* input = nullptr; + Tensor* imgSize = nullptr; + Tensor* outputBoxes = nullptr; + Tensor* outputScores = nullptr; int downsampleRatio; std::vector anchors; int classNum; @@ -229,15 +237,15 @@ struct ScaleParam : PEParam { Tensor* scale = nullptr; Tensor* bias = nullptr; - Tensor* alignedScale() { return alignedScale_; } + Tensor* alignedScale() { return &alignedScale_; } - Tensor* alignedBias() { return alignedBias_; } + Tensor* alignedBias() { return &alignedBias_; } ScaleArgs args = {0}; protected: - Tensor* alignedScale_ = new Tensor(); - Tensor* alignedBias_ = new Tensor(); + Tensor alignedScale_; + Tensor alignedBias_; }; struct ResizeParam : PEParam { diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp index b4eac2c41e138cab19197ccb8ab89681a69ec6fe..48fb16a7ecde7416fb32ee228c9dd26e9c0f2d5b 100644 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/conv_pe.hpp @@ -195,16 +195,6 @@ class ConvPE : public PE { addPE_.init(); addPE_.apply(); addPE_.dispatch(); - - // param_.output->printScale(); - - // params[0]->input.saveToFile("conv_1.txt"); - // params[1]->input.saveToFile("conv_2.txt"); - - // params[0]->output.saveToFile("ew_o1.txt"); - // params[1]->output.saveToFile("ew_o2.txt"); - // std::cout << "\n ================== EW ================== \n"; - // } } return ret == 0; @@ -212,6 +202,8 @@ class ConvPE : public PE { ConvParam& param() { return param_; } + ~ConvPE() {} + 
private: bool use_cpu_ = false; bool split_channel = false; diff --git a/lite/backends/fpga/KD/pes/relu_pe.hpp b/lite/backends/fpga/KD/pes/relu_pe.hpp index 5c125010c27615c545ba274b259f18c775db3d55..dfc70867735b18f10970864888eca88c7f2dc56e 100755 --- a/lite/backends/fpga/KD/pes/relu_pe.hpp +++ b/lite/backends/fpga/KD/pes/relu_pe.hpp @@ -23,43 +23,27 @@ class ReluPE : public PE { public: bool init() { Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); + output->setAligned(param_.input->aligned()); + output->setDataLocation(CPU); return true; } - void apply() { - Tensor* src = param_.input; - - args_.input_data_type = DATA_TYPE_FP16; - args_.output_data_type = DATA_TYPE_FP16; - args_.input_layout_type = LAYOUT_HWC; - args_.output_layout_type = LAYOUT_HWC; - args_.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().channel(), - .width = (uint32_t)src->shape().width(), - .height = (uint32_t)src->shape().height(), - .pad_width = 0u, - .pad_height = 0u}; - args_.output = { - .address = param_.output->data(), - .scale_address = param_.output->scale(), - }; - - inplace_.relu_enable = false; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - } + void apply() {} bool dispatch() { - inplace_.relu_enable = true; - config_inplace(inplace_); - param_.input->syncToDevice(); - param_.output->copyFrom(param_.input); - param_.output->invalidate(); - inplace_.relu_enable = false; - config_inplace(inplace_); + param_.input->invalidate(); + int16_t* input_data = param_.input->data(); + float16* out_data = param_.output->data(); + for (int i = 0; i < param_.input->shape().alignedElementCount(); i++) { + int16_t v = param_.input->data()[i]; + if (v > 0) { + out_data[i] = input_data[i]; + } else { + out_data[i] = zero; + } + } + param_.output->copyScaleFrom(param_.input); + param_.output->flush(); return true; } @@ -67,8 +51,7 @@ class ReluPE : public PE { private: 
InputParam param_; - BypassArgs args_; - InplaceArgs inplace_; + float16 zero = float_to_half(0.0f); }; } // namespace zynqmp diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp index 09755c65a322da8ccab0d57dd2e877712b112361..5ff94edd747fe9f01741baf1efaad288ed32b98d 100755 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -36,6 +36,7 @@ class ScalePE : public PE { } inline int lcm(int a, int b) { return a * b / gcd(a, b); } + bool init() { Tensor* output = param_.output; output->setAligned(true); diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 1e1793faae664ff7ea999b11d2d1cfb16e57390d..065495fd8571691196700cd9da23af282b882240 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -283,7 +283,6 @@ class Tensor { .address = data(), .scale_address = scale(), }; args.output = output; - src->syncToDevice(); size_t aligned_remainder = src->shape().numel() % 16; if (aligned_remainder > 0) { size_t dtype_size = @@ -293,7 +292,6 @@ class Tensor { fpga_flush(dst, aligned_remainder * dtype_size); } src->syncToDevice(); - this->invalidate(); perform_bypass(args); this->invalidate(); } @@ -303,8 +301,7 @@ class Tensor { return; } - size_t memorySize = - shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_; + size_t memorySize = placeHolder_->memorySize(); fpga_flush(placeHolder_->data(), memorySize); } @@ -384,7 +381,6 @@ class Tensor { } void save_file_with_name(std::string path) { - invalidate(); std::ofstream ofs; ofs.open(path); ofs << scale()[0] << " / " << scale()[1] << std::endl; diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index 34125a0f47d373022f0e5e7828a2b11dc629cd48..0af17ecbe76523b8dcff150863661da93b73d553 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -5,7 +5,7 @@ endif() set(fpga_deps fpga_target_wrapper kernel_fpga) -# 
add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) +add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps}) # add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) @@ -25,7 +25,7 @@ add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps}) # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps}) add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps}) add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps}) -# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op) +add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op) # add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps}) add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps}) # add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/activation_compute.cc b/lite/kernels/fpga/activation_compute.cc index ecd9af0f8da5df62a15637e88dc4564efb187f6c..f6704204d34c309835c1de0ef61afed97c0b29e3 100644 --- a/lite/kernels/fpga/activation_compute.cc +++ b/lite/kernels/fpga/activation_compute.cc @@ -25,10 +25,10 @@ using float16 = zynqmp::float16; void ReluCompute::PrepareForRun() { auto& param = this->Param(); auto output_data = param.Out->mutable_data(); - zynqmp::InputParam& input_param = pe_.param(); + zynqmp::InputParam& relu_param = pe_.param(); - input_param.input = param.X->ZynqTensor(); - input_param.output = param.Out->ZynqTensor(); + relu_param.input = param.X->ZynqTensor(); + relu_param.output = param.Out->ZynqTensor(); pe_.init(); pe_.apply(); } diff --git 
a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc index d22cc7abacc2ecd80e54aa5c62a7e57671b920c9..1ef6d19e83ee71a8e81862c9a4837243f18675f7 100755 --- a/lite/kernels/fpga/elementwise_compute.cc +++ b/lite/kernels/fpga/elementwise_compute.cc @@ -80,21 +80,21 @@ void ElementwiseMulCompute::PrepareForRun() { scale_param.activeParam.type = zynqmp::TYPE_NONE; int channel = scale_param.input->shape().channel(); - zynqmp::Tensor* scale = new zynqmp::Tensor(); - zynqmp::Tensor* bias = new zynqmp::Tensor(); - scale_param.scale = scale; - scale_param.bias = bias; + scale_param.scale = &scale_; + scale_param.bias = &bias_; zynqmp::Shape shape(zynqmp::N, {channel}); - float* scale_data = scale->mutableData(zynqmp::FP32, shape); - float* bias_data = bias->mutableData(zynqmp::FP32, shape); + zynqmp::float16* scale_data = + scale_.mutableData(zynqmp::FP16, shape); + zynqmp::float16* bias_data = + bias_.mutableData(zynqmp::FP16, shape); float scale_value = param.Y->data()[0]; - for (int i = 0; i < channel; ++i) { + for (int i = 0; i < channel; i++) { if (param.Y->dims().production() != 1) { scale_value = param.Y->ZynqTensor()->data()[i]; } - scale_data[i] = scale_value; - bias_data[i] = 0; + scale_data[i] = zynqmp::float_to_half(scale_value); + bias_data[i] = zero_; } pe_.init(); @@ -102,6 +102,10 @@ } void ElementwiseMulCompute::Run() { + auto& param = Param(); + param.Y->ZynqTensor()->flush(); + 
scale_.copyFrom(param.Y->ZynqTensor()); + scale_.invalidate(); pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ScaleParam& scale_param = pe_.param(); diff --git a/lite/kernels/fpga/elementwise_compute.h b/lite/kernels/fpga/elementwise_compute.h index e3e9c52c4c660e9ae6852f2ec8cdd815829ad524..9fa4991161dff6bba6c860838863b1cb38393877 100644 --- a/lite/kernels/fpga/elementwise_compute.h +++ b/lite/kernels/fpga/elementwise_compute.h @@ -61,6 +61,9 @@ class ElementwiseMulCompute private: zynqmp::ScalePE pe_; + zynqmp::Tensor scale_; + zynqmp::Tensor bias_; + zynqmp::float16 zero_ = zynqmp::float_to_half(0.0f); }; } // namespace fpga diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc index 71ec37a64d94bcbef00d7e3c2a187bdb28c47935..18aea40c4e27241e0113f326d9cc98bfccf30d2b 100755 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -55,6 +55,7 @@ void FetchCompute::Run() { #ifdef FPGA_PRINT_TENSOR zynqmp::OutputParam& fetch_param = pe_.param(); Debugger::get_instance().registerOutput("fetch", fetch_param.output); + Debugger::get_instance().setEnable(true); #endif }