diff --git a/CMakeLists.txt b/CMakeLists.txt
index 77a94bea1efcdafaa67b4c078bfb0a756f7b1cec..69352c59f0145e1cb418eb068632d048c16a5146 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,9 @@ if (WITH_PADDLE_MOBILE)
   return()
 endif(WITH_PADDLE_MOBILE)
 
+# set(CMAKE_BUILD_TYPE Debug)
+
+
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(CMAKE_CXX_STANDARD 11)
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
index 9b1189c407d6d601bb3e5ba8172b1455f04710fd..c0510e0f381a2ba6ae355870752dcb7dae1bd93f 100755
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+#include <unordered_map>
 #include <iostream>
 #include <vector>
 
@@ -37,8 +39,19 @@ class Debugger {
     }
   }
 
+  void tick(std::string key) {
+    float value = 0;
+    if (tick_tock_map.count(key) > 0) {
+      value += tick_tock_map[key];
+    }
+    tick_tock_map[key] = value;
+  }
+
+  void tock(std::string key) {}
+
  private:
   std::unordered_map<std::string, bool> op_config;
+  std::unordered_map<std::string, float> tick_tock_map;
   Debugger() {
     op_config["concat"] = true;
     op_config["pooling"] = true;
diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp
index eddf5ca454cdc9e91f87d6e4f2c8dfc13f35fdc6..fe66c84143fbc05f0b52a11e2e315b7f3db9054c 100755
--- a/lite/backends/fpga/KD/dl_engine.hpp
+++ b/lite/backends/fpga/KD/dl_engine.hpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdio.h>
+
 #include "lite/backends/fpga/KD/llapi/filter.h"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 
diff --git a/lite/backends/fpga/KD/io.cpp b/lite/backends/fpga/KD/io.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..705faefe7443270c40b7a7c88f517f5381d83919
--- /dev/null
+++ b/lite/backends/fpga/KD/io.cpp
@@ -0,0 +1,43 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "io.hpp"
+
+namespace paddle {
+namespace zynqmp {
+
+// Defined out of line so the private constructor used by get_instance() links.
+FpgaIO::FpgaIO() {}
+
+// void FpgaIO::setMutex(std::mutex* mtx) { mtx_ = mtx; }
+
+// void FpgaIO::setConditionVariable(std::condition_variable* condition) {
+//   condition_ = condition;
+// }
+
+// void FpgaIO::lock() {
+//   if (mtx_ != nullptr && !locked_) {
+//     mtx_->lock();
+//     locked_ = true;
+//   }
+// }
+
+// void FpgaIO::unlock() {
+//   if (mtx_ != nullptr) {
+//     mtx_->unlock();
+//     condition_->notify_one();
+//   }
+//   locked_ = false;
+// }
+
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/io.hpp b/lite/backends/fpga/KD/io.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef828da41c50c9d5085a6696a89c76334a73a21f
--- /dev/null
+++ b/lite/backends/fpga/KD/io.hpp
@@ -0,0 +1,50 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+// #include <condition_variable>
+// #include <mutex>
+
+namespace paddle {
+namespace zynqmp {
+
+class FpgaIO {
+ public:
+  static FpgaIO& get_instance() {
+    static FpgaIO s_instance;
+    return s_instance;
+  }
+
+  void allocData(size_t s) { data_ = new float[s]; }
+
+  float* getData() { return data_; }
+
+  // void setMutex(std::mutex* mtx);
+  // void setConditionVariable(std::condition_variable* condition);
+  // void lock();
+  // void unlock();
+
+ private:
+  // std::mutex* mtx_ = nullptr;
+  // std::condition_variable* condition_ = nullptr;
+  // bool locked_ = false;
+
+  float* data_ = nullptr;
+
+  FpgaIO();
+};
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp
index da81565cf5ca152a54b6cc1514cb660589428439..7727345b1c138ba7d84bcbcd078badb2e2fb98d5 100755
--- a/lite/backends/fpga/KD/llapi/filter.cpp
+++ b/lite/backends/fpga/KD/llapi/filter.cpp
@@ -240,8 +240,8 @@ int8_t* format_filter(float* data_in,
   for (int n = 0; n < num; n++) {
     float* filter_start = data_in + n * chw;
     int8_t* quantized_start = quantized_data + n * chw;
-    quantize(filter_start, quantized_start, chw, max);
-    filter_max.push_back(1);
+    quantize(filter_start, quantized_start, chw, f_max);
+    filter_max.push_back(f_max);
   }
 
   int8_t* hwc_data =
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
index 1408a034cb6a975e32d92da0406f98df7f2409c1..7a2c92335788364426b82d60b6a1ad85e633021c 100755
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -205,7 +205,7 @@ int get_device_info(const struct DeviceInfo &args) {
 int perform_bypass(const struct BypassArgs &args) {
   int ret = -1;
   int size = args.image.channels * args.image.width * args.image.height;
-  int max_size = 1 << 21;
+  int max_size = 1 << 22;
 
   float times = 1.0 * size / max_size;
   int count = static_cast<int>(times);
diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
index a2b184e383aa600b1279197a115c58309e204a95..56433ac19f192ef65d2b3b10a10402e6c64624a2 100644
--- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
+++ b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -14,6 +14,8 @@ limitations under the License.
 */
 
 #pragma once
+#include <string.h>
+#include <vector>
 
 #include "lite/backends/fpga/KD/pe.hpp"
@@ -38,7 +40,6 @@ class FullyConnectedPE : public PE {
     Tensor* input = param_.input;
     convParam_.input = param_.input;
    convParam_.output = param_.output;
-    // convParam_.relu = param_.relu;
     convParam_.activeParam.type = param_.activeParam.type;
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
@@ -54,32 +55,40 @@ class FullyConnectedPE : public PE {
     int height = param_.input->shape().height();
     int width = param_.input->shape().width();
-    int filter_channel = chw / height / width;
+    // int filter_channel = chw / height / width;
 
     int channel = param_.output->shape().channel();
-    Shape shape(NCHW, {num, filter_channel, height, width});
-    Tensor* conv_filter = new Tensor();
-    float* new_filter_data = conv_filter->mutableData<float>(FP32, shape);
+
+    int align = 32;
+    int chw_aligned = ((chw + align - 1) / align) * align;
+    Shape shape(NCHW, {num, chw_aligned, 1, 1});
+    float* new_filter_data = conv_filter_.mutableData<float>(FP32, shape);
     float* filter_data = param_.filter->data<float>();
+    memset(new_filter_data, 0, num * chw_aligned * sizeof(float));
+
     for (int i = 0; i < num; i++) {
       for (int j = 0; j < chw; j++) {
         float scale = filter_data[j * num + i];
-        new_filter_data[i * chw + j] = scale;
+        new_filter_data[i * chw_aligned + j] = scale;
       }
     }
-    conv_filter->flush();
-    convParam_.filter = conv_filter;
+    conv_filter_.flush();
+    convParam_.filter = &conv_filter_;
+    // param_.filter->saveToFile("param_filter", true);
+    // conv_filter_.saveToFile("conv_filter", true);
+    // exit(-1);
 
-    Shape sb_shape(N, {channel});
+    Shape sb_shape(N, {num});
     float* scale_data = convParam_.scale()->mutableData<float>(FP32, sb_shape);
     float* bias_data = convParam_.bias()->mutableData<float>(FP32, sb_shape);
 
-    for (int i = 0; i < channel; i++) {
+    for (int i = 0; i < num; i++) {
       scale_data[i] = 1.0f;
       bias_data[i] = param_.bias->data<float>()[i];
     }
     convParam_.scale()->flush();
     convParam_.bias()->flush();
@@ -115,14 +124,41 @@ class FullyConnectedPE : public PE {
     output->flush();
     output->scale()[0] = max / 127.0f;
     output->scale()[1] = 127.0f / max;
+    output->saveToFile("cpu_compute", true);
+    // exit(-1);
+  }
+
+  void batch_to_w() {
+    ConvParam& convParam_ = convPE_.param();
+
+    int channel = param_.input->shape().channel();
+    param_.input->invalidate();
+
+    int remainder =
+        aligned_input_.shape().channel() - param_.input->shape().channel();
+
+    float max = 0;
+    for (int n = 0; n < param_.input->shape().num(); n++) {
+      memset(aligned_input_.data<float16>(),
+             0,
+             aligned_input_.shape().channel() * sizeof(float16));
+      memcpy(aligned_input_.data<float16>() +
+                 n * aligned_input_.shape().channel(),
+             param_.input->data<float16>() + n * channel,
+             channel * sizeof(float16));
+      aligned_input_.copyScaleFrom(param_.input);
+      aligned_input_.flush();
+    }
+
+    convPE_.dispatch();
   }
 
   bool dispatch() {
-    // int num = param_.filter->shape().channel();
-    // if (num == 2) {
-    //   cpu_compute();
-    //   return 1;
-    // } else {
+    // batch_to_w();
+    // return 1;
+    // cpu_compute1();
+    // return 1;
+
     return convPE_.dispatch();
     // }
   }
@@ -131,7 +167,10 @@ class FullyConnectedPE : public PE {
 
  private:
   FullyConnectedParam param_;
+  Tensor aligned_input_;
+  Tensor aligned_output_;
   ConvPE convPE_;
+  Tensor conv_filter_;
 };
 
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/input_pe.hpp b/lite/backends/fpga/KD/pes/input_pe.hpp
index 380c85e17e7de63486bbc93bb62ae8728286dd7a..d8f9a15c6a94f6869a588f758982800b35eecc40 100755
--- a/lite/backends/fpga/KD/pes/input_pe.hpp
+++ b/lite/backends/fpga/KD/pes/input_pe.hpp
@@ -29,6 +29,7 @@ class InputPE : public PE {
   }
 
   bool dispatch() {
+    // std::cout << "input_dispatch()\n";
     Tensor* input = param_.input;
     Tensor* output = param_.output;
 
diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp
index 2944691693b135a2d2df7b91ecbe0ef249b015d8..2d02d30fbae12efc372e58c2ad80348356a8f22d 100755
--- a/lite/backends/fpga/KD/pes/output_pe.hpp
+++ b/lite/backends/fpga/KD/pes/output_pe.hpp
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <iostream>
+#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 #include "lite/backends/fpga/KD/pe.hpp"
 #include "lite/backends/fpga/KD/pe_params.hpp"
 
@@ -52,6 +54,12 @@ class OutputPE : public PE {
     memcpy(DLEngine::get_instance().out_data,
            output->data<float>(),
            output->shape().numel() * sizeof(float));
+
+    fpga_reset();
+
+    auto max = fpga_get_memory_size_max();
+    std::cout << "PL max memory size: " << max << std::endl;
+
     return true;
   }
 
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
index 988bc1bb507036de8f13a6c6549c549718bd1256..a19d55a64dede6e6e929d605359fcbab826c13e2 100644
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -103,12 +103,17 @@ class Tensor {
     return reinterpret_cast<Dtype*>(ptr);
   }
 
+  void releaseData() {
+    released = true;
+    placeHolder_.reset();
+  }
+
   template <typename Dtype>
   Dtype* mutableData(DataType dataType, const Shape& shape) {
-    if (this->shape_ != nullptr) {
-      delete shape_;
-    }
-    this->shape_ = new Shape(shape);
+    // std::cout << "before new shape\n";
+    this->shape_.reset(new Shape(shape));
+    // std::cout << "new shape\n";
     this->dataType_ = dataType;
     return mutableData<Dtype>();
   }
 
@@ -117,11 +122,14 @@ class Tensor {
   Dtype* mutableData() {
     size_t memorySize =
         shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+    // std::cout << "mem_size:" << memorySize << std::endl;
     if (placeHolder_ != nullptr) {
+      // std::cout << "placeHolder_ not null" << std::endl;
       if (memorySize > placeHolder_->memorySize()) {
         placeHolder_.reset(new PlaceHolder(memorySize));
       }
     } else {
+      // std::cout << "placeHolder_ null" << std::endl;
       placeHolder_.reset(new PlaceHolder(memorySize));
     }
     return data<Dtype>();
 
@@ -138,7 +146,7 @@ class Tensor {
 
   DataType dataType() { return this->dataType_; }
 
-  Shape& shape() { return *shape_; }
+  Shape& shape() { return *(shape_.get()); }
 
   bool aligned() { return this->aligned_; }
 
@@ -247,15 +255,13 @@ class Tensor {
   void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
 
   void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
-    if (shape_ != nullptr) {
-      delete shape_;
-    }
+    this->placeHolder_ = src->placeHolder_;
     this->dataType_ = src->dataType_;
     this->aligned_ = src->aligned_;
     this->dateLocation_ = src->dateLocation_;
     this->offset = offset;
-    shape_ = new Shape(const_cast<Shape&>(shape));
+    shape_.reset(new Shape(shape));
   }
 
   void copyFrom(Tensor* src) {
 
@@ -300,7 +306,13 @@ class Tensor {
   }
 
   void flush() {
-    size_t memorySize = placeHolder_->memorySize();
+    if (released) {
+      // std::cout << "flush::" << this << std::endl;
+      return;
+    }
+
+    size_t memorySize =
+        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
     fpga_flush(placeHolder_->data(), memorySize);
   }
 
@@ -451,18 +461,12 @@ class Tensor {
     return os;
   }
 
-  ~Tensor() {
-    if (shape_ != nullptr) {
-      delete shape_;
-      shape_ = nullptr;
-    }
-  }
-
  private:
+  bool released = false;
   int offset = 0;
   float mem_scale_factor_ = 1.0f;
   std::shared_ptr<PlaceHolder> placeHolder_;
-  Shape* shape_ = nullptr;
+  std::shared_ptr<Shape> shape_;
   DataType dataType_ = FP32;
   bool aligned_ = false;
   DataSyncStatus synchedStatus_ = Synched;
diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc
index 7f1e8d3e17f97315e77532b77bbcfcc8331edd4f..6ec9f6866ab859460ebfa56a0e13800d45ba5d52 100755
--- a/lite/backends/fpga/lite_tensor.cc
+++ b/lite/backends/fpga/lite_tensor.cc
@@ -69,7 +69,7 @@ std::string DDimLite::repr() const {
 }
 
 void TensorLite::ShareDataWith(const TensorLite &other) {
-  buffer_ = other.buffer_;
+  buffer_ = other.buffer_;  // TODO(chonwhite) delete buffer;
   dims_ = other.dims_;
   zynq_tensor_ = other.zynq_tensor_;
   target_ = other.target_;
@@ -79,10 +79,10 @@ void TensorLite::ShareDataWith(const TensorLite &other) {
 }
 
 void *TensorLite::mutable_data(size_t memory_size) {
-  memory_size_ = memory_size;
+  memory_size_ = memory_size;  // TODO(chonwhite) delete buffer;
   buffer_->ResetLazy(target_, memory_size_);
   // throw -1;
-  std::cout << memory_size << std::endl;
+  // std::cout << memory_size << std::endl;
   return buffer_->data();
 }
 
@@ -92,16 +92,33 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
 }
 
 void TensorLite::CopyDataFrom(const TensorLite &other) {
+  // std::cout << "other:: " << &other << std::endl;
   dims_ = other.dims_;
   target_ = other.target_;
   lod_ = other.lod_;
-  auto dt = zynq_tensor_->dataType();
 
-  auto shape = other.zynq_tensor_->shape();
+  // std::cout << "before dataType\n";
+  if (zynq_tensor_.get() == nullptr) {
+    zynq_tensor_.reset(new zynqmp::Tensor());
+  }
+  auto dt = zynq_tensor_->dataType();
+  // std::cout << "after dataType\n";
 
   Resize(other.dims());
+  auto shape = other.zynq_tensor_->shape();
+  // std::cout << "after resize\n";
   zynq_tensor_->mutableData<void>(dt, shape);
-  this->ZynqTensor()->copyFrom(other.ZynqTensor());
+  // std::cout << "ZynqTensor():" << this->ZynqTensor() << std::endl;
+  // std::cout << "other Tensor():" << other.ZynqTensor() << std::endl;
+
+  // this->ZynqTensor()->copyFrom(other.ZynqTensor());
+  memcpy(this->ZynqTensor()->data<float>(),
+         other.ZynqTensor()->data<float>(),
+         other.ZynqTensor()->shape().numel() * sizeof(float));
+  // std::cout << "after copyFrom\n";
 }
 
 }  // namespace lite
diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h
index 311fc8a98400e5a6916ba1b9c8de1e6e0bcec4c0..f83bed541e59639daed83eefc503f8de6f05aef4 100644
--- a/lite/backends/fpga/lite_tensor.h
+++ b/lite/backends/fpga/lite_tensor.h
@@ -81,6 +81,10 @@ class DDimLite {
     return !(a == b);
   }
 
+  ~DDimLite() {
+    // std::cout << "free DDimLite\n";
+  }
+
  private:
   std::vector<value_type> data_;
 };
@@ -109,7 +113,12 @@ class TensorLite {
     return zynq_tensor_->data<R>() + offset_;
   }
 
-  void Resize(const DDimLite &ddim) { dims_ = ddim; }
+  void Resize(const DDimLite &ddim) {
+    // std::cout << "Resize\n";
+    // std::cout << "ddim:" << &ddim << std::endl;
+    dims_ = ddim;
+    // std::cout << "after Resize\n";
+  }
 
   void Resize(const std::vector<int64_t> &x) { dims_ = DDimLite(x); }
 
   const DDimLite &dims() const { return dims_; }
@@ -142,7 +151,9 @@ class TensorLite {
   void *mutable_data(size_t memory_size);
   void *mutable_data(TargetType target, size_t memory_size);
 
-  const void *raw_data() const { return buffer_->data(); }
+  const void *raw_data() const {
+    return buffer_->data();
+  }  // TODO(chonwhite) delete buffer;
 
   size_t data_size() const { return this->dims().production(); }
 
@@ -150,7 +161,9 @@ class TensorLite {
 
   size_t offset() const { return offset_; }
 
-  bool IsInitialized() const { return buffer_->data(); }
+  bool IsInitialized() const {
+    return buffer_->data();
+  }  // TODO(chonwhite) delete buffer;
 
   // Other share data to this.
   void ShareDataWith(const TensorLite &other);
@@ -165,7 +178,10 @@ class TensorLite {
 
   TargetType target() const { return target_; }
 
-  zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
+  // template <typename T>
+  // TensorLite Slice(int64_t begin, int64_t end) const;
+
+  zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_.get(); }
 
   friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
     os << "Tensor:" << '\n';
@@ -194,7 +210,7 @@ class TensorLite {
   size_t memory_size_{};
   size_t offset_{0};
 
-  zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
+  std::shared_ptr<zynqmp::Tensor> zynq_tensor_;
 
   template <typename T>
   void mutable_data_internal();
 };
 
@@ -203,6 +219,7 @@ class TensorLite {
 template <typename T, typename R>
 R *TensorLite::mutable_data() {
   std::vector<int64_t> v;
+  // std::cout << "mutable_data\n";
   for (int i = 0; i < dims_.size(); i++) {
     v.push_back(dims_[i]);
   }
@@ -225,7 +242,7 @@ R *TensorLite::mutable_data() {
       break;
   }
   zynqmp::Shape input_shape(layout_type, v);
-
+  // std::cout << "input_shape\n";
   zynqmp::DataType data_type = zynqmp::FP32;
   if (typeid(T) == typeid(float)) {
     data_type = zynqmp::FP32;
@@ -233,6 +250,13 @@ R *TensorLite::mutable_data() {
   if (typeid(T) == typeid(zynqmp::float16)) {
     data_type = zynqmp::FP16;
   }
+  // std::cout << "zynq_tensor_:" << zynq_tensor_.get() << std::endl;
+
+  if (zynq_tensor_.get() == nullptr) {
+    zynq_tensor_.reset(new zynqmp::Tensor());
+  }
+
   return zynq_tensor_->mutableData<R>(data_type, input_shape);
 }
 
@@ -272,6 +296,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
 
 template <typename T>
 void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
+  // TODO(chonwhite) delete this function;
   CHECK_GE(begin, 0);
   CHECK_LE(end, dims_[0]);
   CHECK_LT(begin, end);
diff --git a/lite/kernels/arm/sequence_pool_compute.cc b/lite/kernels/arm/sequence_pool_compute.cc
index 8fcbb8cffe72935e4df503c3c1748ddb68247fb7..93072fe499eed296d6e31d87ee9b74494de07aa1 100644
--- a/lite/kernels/arm/sequence_pool_compute.cc
+++ b/lite/kernels/arm/sequence_pool_compute.cc
@@ -59,6 +59,7 @@ void SequencePoolCompute::Run() {
   for (int i = 0; i <= batch_size; i++) {
     offset_new[i] = i;
   }
+  (output->mutable_lod())->clear();
   (output->mutable_lod())->push_back(offset_new);
 }
 
diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt
index a017487b48b23b8b6096c901dc00d277f42d6aac..34125a0f47d373022f0e5e7828a2b11dc629cd48 100755
--- a/lite/kernels/fpga/CMakeLists.txt
+++ b/lite/kernels/fpga/CMakeLists.txt
@@ -14,7 +14,6 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
 # add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
 add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
 add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
-# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
 add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
 add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc
index 79329e99a3e5e812dca487c17452f3f5d1e96449..9ca2424bc2f8a748c348cac4aafd219e538c7a17 100755
--- a/lite/kernels/fpga/feed_compute.cc
+++ b/lite/kernels/fpga/feed_compute.cc
@@ -40,8 +40,8 @@ void FeedCompute::PrepareForRun() {
 
 void FeedCompute::Run() {
   auto& param = this->Param<param_t>();
   Tensor& x = param.feed_list->at(param.col);
+  pe_.param().input = x.ZynqTensor();
   pe_.dispatch();
-
   auto out_lod = param.out->mutable_lod();
   *out_lod = x.lod();
 
diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc
index 2d296f4d4a89b1fd86e5b2330d3caf44fbad0903..71ec37a64d94bcbef00d7e3c2a187bdb28c47935 100755
--- a/lite/kernels/fpga/fetch_compute.cc
+++ b/lite/kernels/fpga/fetch_compute.cc
@@ -82,6 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
                      kNHWC,
                      paddle::lite::kernels::fpga::FetchCompute,
                      host_host)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
diff --git a/lite/kernels/fpga/mul_compute.cc b/lite/kernels/fpga/mul_compute.cc
index c27600d9f773ff0aae04a2ee519905bc0e58785c..659c8dfb653c1707105a7337493ee4f3b3357b76 100755
--- a/lite/kernels/fpga/mul_compute.cc
+++ b/lite/kernels/fpga/mul_compute.cc
@@ -80,7 +80,8 @@ void mul(MulCompute* k) {
 }
 
 void MulCompute::Run() {
-  pe_.dispatch();
+  // pe_.dispatch();
+  mul(this);
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::FullyConnectedParam& fc_param = pe_.param();
   Debugger::get_instance().registerOutput("mul", fc_param.output);
diff --git a/lite/kernels/host/one_hot_compute.cc b/lite/kernels/host/one_hot_compute.cc
index e0af6f5173f367bb9b2e06de10499ee36806379c..e1bf4c103bcb59c59617d8c5d1ce10ae8780e403 100755
--- a/lite/kernels/host/one_hot_compute.cc
+++ b/lite/kernels/host/one_hot_compute.cc
@@ -16,7 +16,7 @@
 
 #include <algorithm>
 #include <vector>
-#include "lite/backends/fpga/KD/debugger.hpp"
+// #include "lite/backends/fpga/KD/debugger.hpp"
 #include "lite/kernels/host/one_hot_compute.h"
 #include "lite/utils/paddle_enforce.h"
 
diff --git a/lite/operators/one_hot_op.cc b/lite/operators/one_hot_op.cc
index 023cdc23aeb8329736b7438af2c52cbfa899c75c..ebab9e20679038546611b8dc3221f4ecba1bbe21 100644
--- a/lite/operators/one_hot_op.cc
+++ b/lite/operators/one_hot_op.cc
@@ -15,7 +15,7 @@
 #include "lite/operators/one_hot_op.h"
 #include "lite/core/op_registry.h"
 
-#include "lite/backends/fpga/KD/debugger.hpp"
+// #include "lite/backends/fpga/KD/debugger.hpp"
 
 namespace paddle {
 namespace lite {
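
---
Review notes (editor commentary; the sketches below are illustrative, not part of the patch):

1) lite/backends/fpga/KD/debugger.hpp: tick() currently only round-trips
   tick_tock_map, and tock() is an empty stub. A minimal sketch of how the
   pair could be completed with std::chrono; the class name, the separate
   start-time map, and the millisecond granularity are assumptions, not
   taken from the patch:

    #include <chrono>
    #include <string>
    #include <unordered_map>

    class TickTock {
     public:
      // tick() records a start point for `key`.
      void tick(const std::string& key) {
        starts_[key] = std::chrono::steady_clock::now();
      }

      // tock() accumulates the elapsed milliseconds since the matching tick().
      void tock(const std::string& key) {
        auto it = starts_.find(key);
        if (it == starts_.end()) return;
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                      std::chrono::steady_clock::now() - it->second)
                      .count();
        elapsed_ms_[key] += us / 1000.0f;  // float map, like tick_tock_map
      }

     private:
      std::unordered_map<std::string,
                         std::chrono::steady_clock::time_point> starts_;
      std::unordered_map<std::string, float> elapsed_ms_;
    };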
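2) lite/backends/fpga/KD/llapi/zynqmp_api.cpp: the per-call bypass limit
   grows from 1 << 21 to 1 << 22 elements. perform_bypass() splits an image
   into full chunks plus a remainder pass, per the
   `count = static_cast<int>(1.0 * size / max_size)` arithmetic in the hunk.
   A small worked example of that chunking, with an assumed image size:

    #include <cstdio>

    int main() {
      int size = 3 * 1920 * 1080;   // channels * width * height (example)
      int max_size = 1 << 22;       // new per-call limit
      int count = size / max_size;  // number of full chunks
      int remainder = size - max_size * count;
      std::printf("%d full chunks + %d leftover elements\n", count, remainder);
      return 0;
    }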
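3) lite/backends/fpga/KD/pes/fully_connected_pe.hpp: apply() now rewrites the
   FC weight, stored as data[j * num + i], into one zero-padded row per output
   channel so the FC can run as a 1x1 convolution. A standalone sketch of the
   same transpose-and-pad loop (the helper name is ours; the 32-element
   alignment mirrors chw_aligned in the patch):

    #include <vector>

    std::vector<float> repack_fc_filter(const float* data,
                                        int num,   // output channels
                                        int chw,   // input features
                                        int align = 32) {
      int chw_aligned = ((chw + align - 1) / align) * align;
      std::vector<float> out(static_cast<size_t>(num) * chw_aligned, 0.0f);
      for (int i = 0; i < num; i++) {
        for (int j = 0; j < chw; j++) {
          out[i * chw_aligned + j] = data[j * num + i];  // transpose + pad
        }
      }
      return out;
    }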
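4) lite/backends/fpga/KD/tensor.hpp and lite/backends/fpga/lite_tensor.cc:
   moving shape_ and zynq_tensor_ from owning raw pointers to std::shared_ptr
   lets shareDataWith()/ShareDataWith() alias tensors without the
   double-delete risk the removed ~Tensor() carried. One caveat worth
   checking: the new memcpy in CopyDataFrom() always copies
   numel() * sizeof(float) bytes, which over-reads for FP16 tensors; sizing
   the copy from the element type would be safer. A sketch of that idea,
   with a stand-in for the CellSize() helper the patch already uses:

    #include <cstddef>

    enum DataType { FP32, FP16 };  // stand-in for zynqmp::DataType

    // Bytes per element; CellSize() in tensor.hpp plays this role.
    inline size_t cell_size(DataType dt) { return dt == FP32 ? 4 : 2; }

    inline size_t copy_bytes(size_t numel, DataType dt) {
      return numel * cell_size(dt);  // instead of numel * sizeof(float)
    }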