diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 9607961c4785f631afb4b5e207ebff2c8e33623e..b8f131634e9eb4c56218db8f0643f10834089393 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -30,9 +30,10 @@ void format_image(framework::Tensor *image_tensor) {
   auto data_ptr = image_tensor->data<float>();
   auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
   float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
-  float *old_p = p_data;
+
   image::format_image(&p_data, channel, height, width);
-  if (old_p != p_data) {
+
+  if (p_data != data_ptr && external_ptr == nullptr) {
     image_tensor->reset_data_ptr(p_data);
   }
 }
@@ -48,9 +49,9 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3];
-    memory_size =
-        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
+    auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
+    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
+                  sizeof(half);
   } else if (dims.size() == 2) {
     memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
   } else {
@@ -713,7 +714,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
     }
 
     for (int j = 0; j < split_num; ++j) {
-      // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
       arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
           activation_enable;
       arg->split_conv_args[i]
@@ -775,19 +775,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
       fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address,
                  filter_size);
 
-      /*{
-      static int cnt = 0;
-      std::string str = "deconv_filter";
-      if(cnt <= 1){
-          cnt++;
-          str += std::to_string(cnt);
-          int8_t result = 0;
-          fpga::savefile<int8_t>(str,
-      arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result);
-      }
-
-      }*/
-
       size_t bs_align_num = align_to_x(
           arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
       size_t bs_size = 2 * bs_align_num * sizeof(float);
@@ -803,20 +790,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
       memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size);
       fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size);
 
-      /*  {
-            static int cnt = 0;
-            std::string str = "deconv_sb";
-            if(cnt <= 1){
-                cnt++;
-                str += std::to_string(cnt);
-                float result = 0;
-                fpga::savefile<float>(str,
-         arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num,
-         result);
-            }
-
-            }*/
-
       if (split_num == 1) {
         arg->split_conv_args[i]->conv_arg[j].output.address =
             arg->split_conv_args[i]->output.address;
@@ -863,10 +836,13 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int stride_h,
                      int stride_w, int padding_h, int padding_w,
                      float *bias_ptr) {
+  auto deleter = [](void *p) { fpga_free(p); };
+  arg->vector_dwconv_space.push_back(
+      std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
+
   auto filter_ptr = filter->data<int16_t>();
   auto input_ptr = input->data<half>();
-  auto output_ptr = out->data<half>();
-
+  auto output_ptr = out->mutable_data<half>();
   arg->sub_conv_num = 1;
   // arg->relu_enabled = relu_enabled;
   arg->output.activation.activation_type = activation_enable;
diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp
index a52521b8470886c3ee2d3c4979d513a6e8b5aa93..de905f39a244d955011c4e879bd080a53ed66d01 100644
--- a/src/operators/kernel/fpga/V1/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp
@@ -49,6 +49,9 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
 
   fpga::format_image(input);
   auto input_ptr = input->data<float>();
+  auto external_ptr = reinterpret_cast<float *>(input->external_data);
+  float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
+
   auto output_ptr = output->data<half>();
 
   fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
@@ -57,7 +60,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   args.output_data_type = fpga::DATA_TYPE_FP16;
   args.input_layout_type = fpga::LAYOUT_CHW;
   args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = input_ptr;
+  args.image.address = p_data;
   args.image.channels = (uint32_t)input->dims()[1];
   args.image.height = (uint32_t)input->dims()[2];
   args.image.width = (uint32_t)input->dims()[3];
diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
index b575d952371c5352d2d23d465b08d7749b82d140..ad3bcfbaa0ec96545007459ceda20bc13c7efe4b 100644
--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -56,8 +56,9 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
     return;
   }
   fpga::BypassArgs args = param.fpga_bypass_args;
-  auto data = (input->mutable_data<half>());
-  args.image.address = static_cast<void *>(data);
+  auto input_address = (input->data<half>());
+  args.image.address = static_cast<void*>(input_address);
+
   fpga::PerformBypass(args);
   fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
                         param.fpga_bypass_args.image.channels * sizeof(float));
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fdd7c46fedc98b3f1811cd10ffe6bcec7d0e3a46..3af55f075805361fd0cff40ab2e53752ea63f781 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -77,6 +77,10 @@ if (CON GREATER -1)
     ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-rfcn paddle-mobile)
 
+    ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-marker paddle-mobile)
+
+
     set(FOUND_MATCH ON)
 endif ()
 
diff --git a/test/fpga/test_marker.cpp b/test/fpga/test_marker.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5abbcd3629c5c084f1258f5140f3190d99bf2344
--- /dev/null
+++ b/test/fpga/test_marker.cpp
@@ -0,0 +1,167 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#endif
+#include <iostream>
+#include <string>
+
+
+// Loads the entire file `filename` into `buf`; the caller sizes `buf` to fit.
+void readStream(std::string filename, char *buf) {
+  std::ifstream in(filename, std::ios::in | std::ios::binary);
+  if (!in.is_open()) {
+    std::cout << "open File Failed." << std::endl;
+    return;
+  }
+  in.seekg(0, std::ios::end);  // position at the end so tellg() is the size
+  const std::streamoff length = in.tellg();  // -1 when tellg() fails
+  in.seekg(0, std::ios::beg);  // rewind before copying the bytes out
+  if (length < 0 || buf == nullptr || !in.read(buf, length)) {
+    std::cout << "read File Failed." << std::endl;
+  }
+  DLOG << length;
+}
+
+// Reorders `num` images from interleaved HWC order (read via *data_in) into
+// planar CHW order in `data_tmp`; *data_in is advanced past all it consumed.
+void convert_to_chw(int16_t **data_in, int channel, int height, int width,
+                    int num, int16_t *data_tmp) {
+  int64_t plane = width * height;
+  for (int img = 0; img < num; ++img) {
+    int16_t *dst_image = data_tmp + img * plane * channel;
+    for (int row = 0; row < height; ++row) {
+      for (int col = 0; col < width; ++col) {
+        for (int ch = 0; ch < channel; ++ch) {
+          dst_image[ch * plane + width * row + col] = *((*data_in)++);
+        }
+      }
+    }
+  }
+}
+
+// Dumps about `dumpnum` strided fp16 values of a 4-D tensor to `filename` as
+// floats (CHW-reordered when use_chw); dumpnum <= 0 dumps every element.
+void dump_stride_half(std::string filename, Tensor input_tensor,
+                      const int dumpnum, bool use_chw) {
+  if (input_tensor.dims().size() != 4) return;
+  int n = (input_tensor.dims())[0];
+  int c = (input_tensor.dims())[1];
+  int h = (input_tensor.dims())[2];
+  int w = (input_tensor.dims())[3];
+  auto *data_ptr_16 = reinterpret_cast<half *>(input_tensor.get_data());
+  half *chw_copy = nullptr;  // owned scratch buffer, allocated only if use_chw
+  if (use_chw) {
+    chw_copy = reinterpret_cast<half *>(malloc(n * c * h * w * sizeof(int16_t)));
+    if (chw_copy == nullptr) return;
+    half *src = data_ptr_16;  // convert_to_chw advances the cursor it is given
+    convert_to_chw(&src, c, h, w, n, chw_copy);
+  }
+  const half *data_tmp = use_chw ? chw_copy : data_ptr_16;
+  std::ofstream out(filename.c_str());
+  int stride = dumpnum > 0 ? input_tensor.numel() / dumpnum : 1;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    out << paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]) << std::endl;
+  }
+  out.close();
+  free(chw_copy);  // free(nullptr) is a no-op
+}
+
+// Writes about `dumpnum` evenly strided float values of a tensor to `filename`.
+void dump_stride_float(std::string filename, Tensor input_tensor,
+                       const int dumpnum) {
+  auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
+  std::ofstream out(filename.c_str());
+  // A non-positive dumpnum falls back to stride 1 instead of dividing by zero.
+  int stride = dumpnum > 0 ? input_tensor.numel() / dumpnum : 1;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    out << data_ptr[i] << std::endl;
+  }
+  out.close();
+}
+
+
+// Dispatches to the float or half dumper based on the tensor's element type;
+// empty tensors are skipped. Logs a running call index and the data address.
+void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
+                 bool use_chw) {
+  static int call_index = 0;
+  if (input_tensor.numel() == 0) return;
+  const bool is_float = input_tensor.type() == typeid(float);
+  if (is_float) {
+    DLOG << "op: " << call_index++ << ", float data  " << input_tensor.numel();
+    dump_stride_float(filename, input_tensor, dumpnum);
+  } else {
+    DLOG << "op: " << call_index++ << ", half data  " << input_tensor.numel();
+    dump_stride_half(filename, input_tensor, dumpnum, use_chw);
+  }
+  DLOG << "dump input address: " << input_tensor.get_data();
+}
+
+// Model directory and raw float input file used by the marker test below.
+static const char *g_marker_combine = "../models/marker/model";
+static const char *g_image_src_float = "../models/marker/model/input_0.bin";
+
+// Loads the marker model, feeds one image read from disk, runs prediction and
+// dumps the results fetched at indices 47..51 to files named "marker_<i>".
+// Returns 0 normally, or -1 when the input buffer cannot be allocated.
+int main() {
+  paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+
+  if (paddle_mobile.Load(std::string(g_marker_combine), true)) {
+    // Input buffer sized for 720 * 1280 * 3 floats; readStream() does no
+    // bounds checking, so the file must not be larger than this.
+    auto img = reinterpret_cast<float *>(
+        fpga::fpga_malloc(720 * 1280 * 3 * sizeof(float)));
+    if (img == nullptr) {
+      std::cout << "fpga_malloc for input image failed." << std::endl;
+      return -1;
+    }
+    readStream(g_image_src_float, reinterpret_cast<char *>(img));
+
+    paddle_mobile.FeedData({img});
+    // NOTE(review): -1 presumably means "run through the last op" - confirm.
+    paddle_mobile.Predict_To(-1);
+
+    for (int i = 47; i < 52; i++) {
+      auto tensor_ptr = paddle_mobile.FetchResult(i);
+      std::string saveName = "marker_" + std::to_string(i);
+
+      // NOTE(review): the byte count assumes 4-byte elements, but dump_stride
+      // also handles half tensors - confirm over-sizing the range is safe.
+      paddle_mobile::fpga::fpga_invalidate(
+          (*tensor_ptr).get_data(), tensor_ptr->numel() * sizeof(float));
+
+      // Dump every element (dumpnum == numel), reordered to CHW for half data.
+      dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true);
+    }
+
+    DLOG << "Computation done";
+    // Release the input buffer only after inference and dumping are done.
+    fpga::fpga_free(img);
+  }
+
+  return 0;
+}