diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 5cef0ec1a64e7e696d6b5c797e39918d6f1ee915..02db327cb3c261b31a80375b8b2062405a072c3e 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -70,10 +70,11 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
     DLOG << "Wrong ofm dimension";
   }
   auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(half));
   ofm_tensor->fpga_data_num = memory_size / sizeof(half);
+  fpga::fpga_flush(p, memory_size);
 }

 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
@@ -89,10 +90,11 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
     DLOG << "Wrong ofm dimension";
   }
   auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(half));
   ofm_tensor->fpga_data_num = memory_size / sizeof(half);
+  fpga::fpga_flush(p, memory_size);
 }

 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
@@ -108,10 +110,11 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
     DLOG << "Wrong ofm dimension";
   }
   auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(float));
   ofm_tensor->fpga_data_num = memory_size / sizeof(float);
+  fpga::fpga_flush(p, memory_size);
 }

 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -463,9 +466,24 @@ void expand_EW_arg(EWAddArgs *arg) {
   uint64_t image_amount_per_row =
       align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
                  IMAGE_ALIGNMENT);
-  uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
-                               ((uint64_t)args.image0.width << 16) |
-                               (uint64_t)args.image0.height;
+  //////////////////////////////////////////////////////////
+  // temporary modification for the EW and DMA problem
+  uint64_t image_image_pixel = 0;
+  if ((args.image0.width * args.image0.channels) >= 24576) {
+    if ((args.image0.width * args.image0.channels) % 32 != 0) {
+      DLOG << "EW parameter is not supported";
+    } else {
+      image_amount_per_row = image_amount_per_row / 2;
+      image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                          ((uint64_t)(args.image0.width / 2) << 16) |
+                          (uint64_t)(args.image0.height * 2);
+    }
+  } else {
+    image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                        ((uint64_t)args.image0.width << 16) |
+                        (uint64_t)args.image0.height;
+  }
+  //////////////////////////////////////////////////////////

   (*arg).driver.image0_address_phy = image0_address_phy;
   (*arg).driver.image1_address_phy = image1_address_phy;
@@ -560,6 +578,18 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
         reinterpret_cast<char *>(arg->conv_arg[i].filter_address), deleter));
     memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
     fpga_flush(arg->conv_arg[i].filter_address, filter_size);
+    // for test
+    // {
+    //   static int cnt = 0;
+    //   if (cnt == 4) {
+    //     int8_t result = 0;
+    //     std::string str = "fc_filter";
+    //     fpga::savefile(str, arg->conv_arg[i].filter_address, filter_size,
+    //                    result);
+    //   }
+    //   cnt++;
+    // }

     size_t bs_size = 2 *
                      align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
@@ -570,6 +600,18 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
         reinterpret_cast<char *>(arg->conv_arg[i].sb_address), deleter));
     memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
     fpga_flush(arg->conv_arg[i].sb_address, bs_size);
+    // for test
+    /*{
+      static int cnt = 0;
+      if (cnt == 4) {
+        float result = 0;
+        std::string str = "fc_bs";
+        fpga::savefile(str, arg->conv_arg[i].sb_address, bs_size / 4, result);
+      }
+      cnt++;
+    }*/

     if (n > 1) {
       arg->conv_arg[i].output.scale_address =
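The workaround in expand_EW_arg above keeps the total element count constant while halving the DMA row length: one row of width × channels values is re-described to the hardware as two rows of half the width and twice the height. A standalone sketch of the packing arithmetic, with hypothetical helper names (the real code writes the packed word into args.driver):

#include <cassert>
#include <cstdint>

// Pack channels/width/height into the 64-bit register layout used above:
// channels in bits 32..63, width in bits 16..31, height in bits 0..15.
uint64_t pack_image_pixel(uint64_t channels, uint64_t width, uint64_t height) {
  return (channels << 32) | (width << 16) | height;
}

int main() {
  uint64_t c = 64, w = 512, h = 32;  // w * c == 32768 >= 24576, and % 32 == 0
  uint64_t packed = pack_image_pixel(c, w / 2, h * 2);
  assert(((packed >> 32) & 0xFFFFFFFF) == c);
  assert(((packed >> 16) & 0xFFFF) == w / 2);
  assert((packed & 0xFFFF) == h * 2);
  assert((w / 2) * (h * 2) == w * h);  // element count is preserved
  return 0;
}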
diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp
index 50341b75e129479e7f8d8ab4d9c200df574996cb..425d1d1b5c3d0304de06dae9bb0e9fcf32f4d957 100644
--- a/src/fpga/V1/filter.cpp
+++ b/src/fpga/V1/filter.cpp
@@ -268,6 +268,7 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
   quantize(data_in, data_size, max);
   char **quantize_data = (char **)data_in;  // NOLINT
   convert_fc_filter(quantize_data, num, chw);
+  convert_to_hwc(quantize_data, num, channel, height, width);
   align_element(quantize_data, num, chw);
   if (num_after_alignment != num) {
     align_num(quantize_data, num_per_div_before_alignment, num, chw);
@@ -316,7 +317,7 @@ void align_element_n(int16_t **data_in, int num, int height, int width) {
     }
     *data_in = data_tmp;
-    free(tmp);
+    fpga_free(tmp);
   }
 }
 void quantize_to_fp16(float **data_in, int num, int height, int width,
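format_fc_filter above quantizes the float weights to int8 before the new HWC conversion and the alignment steps. A minimal standalone sketch of a symmetric max-to-127 quantizer of the kind it relies on (the in-tree quantize() works in place on the raw tensor buffer, so the name and signature here are illustrative only):

#include <cmath>
#include <cstdint>
#include <vector>

// Map each weight so the largest-magnitude value becomes +/-127.
std::vector<int8_t> quantize_int8(const std::vector<float> &w, float max_abs) {
  const float scale = 127.0f / max_abs;
  std::vector<int8_t> q(w.size());
  for (size_t i = 0; i < w.size(); i++) {
    q[i] = static_cast<int8_t>(std::round(w[i] * scale));
  }
  return q;  // dequantize with w[i] ~= q[i] * max_abs / 127
}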
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 93078d9d4d7f253e37969d770bc5022e03430ab6..fe95e6c7f3e913e0c0801b0371ffe5a179fb77ff 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -90,11 +90,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
     InitMemory();
   }

-#ifdef PADDLE_MOBILE_FPGA
-  program_.scope->EraseVars({"feed", "fetch"});
-  program_.scope->print_vars();
-#endif
-
   int count = 0;
   for (auto &op_handler : ops_of_block0_) {
     DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
@@ -514,6 +509,32 @@ PMStatus Executor<Device, T>::Predict() {
   return PMSuccess;
 }

+template <typename Device, typename T>
+void Executor<Device, T>::FeedTensorData(const vector<framework::Tensor> &v) {
+  auto input_size = v.size();
+  auto *feed_var = program_.scope->Var("feed");
+
+  PADDLE_MOBILE_ENFORCE(input_size == feed_indices_.size(),
+                        "input data number not correct");
+  for (int i = 0; i < input_size; i++) {
+    framework::LoDTensor &target =
+        feed_var->template GetMutable<framework::LoDTensorArray>()->at(i);
+    target.ShareDataWith(v[input_size - i - 1]);
+  }
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::GetTensorResults(
+    std::vector<framework::Tensor *> *v) {
+  auto *fetch_var = program_.scope->Var("fetch");
+  auto output_size = fetch_indices_.size();
+  for (int i = 0; i < output_size; i++) {
+    framework::LoDTensor &target =
+        fetch_var->template GetMutable<framework::LoDTensorArray>()->at(i);
+    v->push_back(&target);
+  }
+}
+
 #ifdef PADDLE_MOBILE_FPGA
 template <typename Device, typename T>
 void Executor<Device, T>::InjectVariable(const Tensor &t,
@@ -559,19 +580,6 @@ void Executor<Device, T>::GetResults(std::vector<void *> *v) {
   }
 }

-template <typename Device, typename T>
-void Executor<Device, T>::GetTensorResults(
-    std::vector<framework::Tensor *> *v) {
-  int index = 0;
-  auto vars = program_.scope->VarContain("fetch", &index);
-  auto output_size = vars.size();
-  for (int i = 0; i < output_size; i++) {
-    auto var = program_.scope->Var("fetch", i + index);
-    auto fetch_tensor = var->template GetMutable<framework::LoDTensor>();
-    v->push_back(fetch_tensor);
-  }
-}
-
 template <typename Device, typename T>
 framework::Tensor *Executor<Device, T>::GetTensorByName(
     const std::string &name) {
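Note the indexing in FeedTensorData above: user input i lands in feed slot input_size - i - 1, and the buffers are shared via ShareDataWith rather than copied. A runnable illustration of just the index reversal:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> inputs = {10, 20, 30};  // user-facing order
  std::vector<int> feed_slots(inputs.size());
  for (size_t i = 0; i < inputs.size(); i++) {
    feed_slots[inputs.size() - i - 1] = inputs[i];  // same indexing as above
  }
  assert(feed_slots[0] == 30 && feed_slots[1] == 20 && feed_slots[2] == 10);
  return 0;
}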
diff --git a/src/framework/executor.h b/src/framework/executor.h
index 074bc4179ade271683a5454edf024661732d270d..fa589880c14fe6477d4443108ca2c436c1987c48 100644
--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -51,15 +51,15 @@ class Executor {
   std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);

+  void FeedTensorData(const std::vector<framework::Tensor> &v);
+  void GetTensorResults(std::vector<framework::Tensor *> *v);
+
 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const Tensor &t, std::string var_name);
   void FeedData(const Tensor &t);
   void FeedData(const std::vector<void *> &v);
-  void GetResults(std::vector<void *> *v);
-  void GetTensorResults(std::vector<framework::Tensor *> *v);
   framework::Tensor *GetTensorByName(const std::string &name);
-
   std::shared_ptr<LoDTensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp
index 74398bbc5b368236d56e5180452b5b05d7d156ad..d140603744ac00a46d54d668c2201d11c1e1d088 100644
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -50,9 +50,6 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
       attrs_(attrs),
       scope_(scope) {
   CheckAllInputOutputSet();
-#ifdef PADDLE_MOBILE_FPGA
-  InsertTensors();
-#endif
 }

 template <typename Dtype>
@@ -72,6 +69,9 @@ void OperatorBase<Dtype>::Run() {
         var->template IsType<framework::LoDTensor>()) {
       const Tensor *tensor = var->template Get<framework::LoDTensor>();
       if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
+#ifdef PADDLE_MOBILE_FPGA
+      DLOG << var_vec_in[i];
+#endif
     }
   }
 }
@@ -83,6 +83,9 @@
         var->template IsType<framework::LoDTensor>()) {
       const Tensor *tensor = var->template Get<framework::LoDTensor>();
       if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
+#ifdef PADDLE_MOBILE_FPGA
+      DLOG << var_vec_out[i];
+#endif
     }
   }
 }
diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc
index 5839a279cdfc03472628cf7650b30064281a226e..1f4769b282385207a5b53d6d678364393d7da6cc 100644
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -146,7 +146,7 @@ void PaddleMobilePredictor<Device, T>::FeedPaddleTensors(
     tensors[i].init(typeid(float));
     ConvertPaddleTensors(inputs[i], &tensors[i]);
   }
-  // paddle_mobile_->FeedTensorData(tensors);
+  paddle_mobile_->FeedTensorData(tensors);
 }

 template <typename Device, typename T>
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index 412c5687b778ced55498f47197fbab51fac0cea5..bf9749393b154f5a1484a95852c2bad300037344 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -236,6 +236,11 @@ template <typename Device, typename T>
 void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
   executor_->FeedData(v);
 }
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::FeedTensorData(
+    const std::vector<framework::Tensor> &v) {
+  executor_->FeedTensorData(v);
+}

 template <typename Device, typename T>
 void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h
index 6439c4cea94fbc8474caa7115afbc58b964e21ad..b05485fcae954e2aa2540ba81110fe36e6421019 100644
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -91,6 +91,7 @@ class PaddleMobile {
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
   void FeedData(const std::vector<void *> &v);
+  void FeedTensorData(const std::vector<framework::Tensor> &v);
   void GetResults(std::vector<void *> *v);
   void GetTensorResults(std::vector<framework::Tensor *> *v);
diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp
index a661cd642c51a1baff2ac6ec97933831bd034c40..f57c517bb00b8d676beaabf24c662efcbe752aeb 100644
--- a/src/operators/kernel/fpga/V1/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp
@@ -21,6 +21,7 @@ template <>
 bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
   auto output = param->Out();
   int col = param->Col();
+  DLOG << "col = " << col;
   auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
   input->init(typeid(float));
   input->Resize(output->dims());
diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
index b128c8e3430b8a359a5ad9dbcba397ad0f2b6568..6fbd81ae7f527b6983e27d482498cb43f1ef93a4 100644
--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -19,6 +19,7 @@ template <>
 bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
   int col = param->Col();
+  DLOG << "col = " << col;
   auto output = &(param->Out()->at(col));
   if (input->type() == typeid(float)) {
     return true;
@@ -59,7 +60,11 @@ template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   auto input = const_cast<LoDTensor *>(param.InputX());
   int col = param.Col();
-  LoDTensor *out = &param.Out()->at(col);
+  auto output = &param.Out()->at(col);
+  if (input->type() == typeid(float)) {
+    output->ShareDataWith(*input);
+    return;
+  }

   fpga::BypassArgs args = param.fpga_bypass_args;
   auto input_address = (input->data<half>());
@@ -67,7 +72,7 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   float *outdata_ptr =
       reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
   const int num_th = 32;
-  if ((out->fpga_data_num) < num_th) {
+  if (output->fpga_data_num < num_th) {
     fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
     for (int idx = 0; idx < product(input->dims()); ++idx) {
@@ -77,14 +82,14 @@
   }

   fpga::PerformBypass(args);
-  auto outC = out->dims()[1];
-  auto outH = out->dims()[2];
-  auto outW = out->dims()[3];
+  auto outC = output->dims()[1];
+  auto outH = output->dims()[2];
+  auto outW = output->dims()[3];

   fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        out->fpga_data_num * sizeof(float));
+                        output->fpga_data_num * sizeof(float));

-  if (out->fpga_data_num != product(input->dims())) {
+  if (output->fpga_data_num != product(input->dims())) {
     float *data_tmp =
         reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
     dealign(outdata_ptr, data_tmp, outC, outH, outW);
@@ -92,7 +97,6 @@
     free(data_tmp);
   }
 }
-
 template class FetchKernel<FPGA, float>;

 }  // namespace operators
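In FetchKernel::Compute above, float inputs now short-circuit through ShareDataWith, while fp16 results still pass through the bypass engine, followed by dealign() whenever the FPGA buffer holds more elements than the logical output. A standalone sketch of what such a de-alignment amounts to, assuming each row was padded up to the hardware alignment (names here are hypothetical; the in-tree dealign() takes (src, dst, outC, outH, outW)):

#include <vector>

// Copy only the first row_len valid elements out of every aligned_len-wide
// padded row, producing a dense buffer.
std::vector<float> strip_row_padding(const std::vector<float> &aligned,
                                     int rows, int row_len, int aligned_len) {
  std::vector<float> dense(static_cast<size_t>(rows) * row_len);
  for (int r = 0; r < rows; r++) {
    for (int i = 0; i < row_len; i++) {
      dense[r * row_len + i] = aligned[r * aligned_len + i];
    }
  }
  return dense;
}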
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 2b9527bb64d41eda203d41dd24d875b7d7aa7843..a63af19380214f62bf73e8b7e4e292186862b8ec 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -68,23 +68,38 @@ endif ()

 list(FIND NET "FPGA_NET_V1" CON)
 if (CON GREATER -1)
-    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-resnet50 paddle-mobile)
+    #ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-resnet50 paddle-mobile)

-    ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-densebox paddle-mobile)
+    #ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-densebox paddle-mobile)

-    ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-rfcn paddle-mobile)
+    #ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-rfcn paddle-mobile)

-    ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-marker paddle-mobile)
+    #ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-marker paddle-mobile)

     ADD_EXECUTABLE(test-rfcn-api fpga/test_rfcn_api.cpp)
     target_link_libraries(test-rfcn-api paddle-mobile)
+
+    ADD_EXECUTABLE(test-mobilenet-api fpga/test_mobilenet_api.cpp)
+    target_link_libraries(test-mobilenet-api paddle-mobile)
+
+    ADD_EXECUTABLE(test-yolo-api fpga/test_yolo_api.cpp)
+    target_link_libraries(test-yolo-api paddle-mobile)
+
+    ADD_EXECUTABLE(test-marker-api fpga/test_marker_api.cpp)
+    target_link_libraries(test-marker-api paddle-mobile)

-    ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h )
-    target_link_libraries(test-marker2 paddle-mobile)
+    #ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h )
+    #target_link_libraries(test-marker2 paddle-mobile)
+
+    #ADD_EXECUTABLE(test-mobilenet fpga/test_mobilenet_beijing.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-mobilenet paddle-mobile)
+
+    #ADD_EXECUTABLE(test-yolo fpga/test_yolo_combine.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-yolo paddle-mobile)

     set(FOUND_MATCH ON)
 endif ()
"Finishing feeding data " << std::endl; + + predictor1->Predict_From_To(0, -1); + std::cout << "Finishing predicting " << std::endl; + + std::vector v1; // No need to initialize v + predictor1->FetchPaddleTensors(&v1); // Old data in v will be cleared + std::cout << "Output number is " << v1.size() << std::endl; + for (int fetchNum = 0; fetchNum < v1.size(); fetchNum++) { + std::string dumpName = "marker2_api_fetch_" + std::to_string(fetchNum); + dump_stride(dumpName, v1[fetchNum]); + } + } + ///////////////////////////////////// + + PaddleMobileConfig config = GetConfig(); + auto predictor = + CreatePaddlePredictor(config); + + std::cout << "Finishing loading model" << std::endl; + + float img_info[3] = {432, 1280, 1.0f}; + int img_length = 432 * 1280 * 3; + auto img = reinterpret_cast(fpga_malloc(img_length * sizeof(float))); + readStream(g_image, reinterpret_cast(img)); + + std::cout << "Finishing initializing data" << std::endl; + struct PaddleTensor t_img_info, t_img; + t_img_info.dtypeid = typeid(float); + t_img_info.layout = LAYOUT_HWC; + t_img_info.shape = std::vector({1, 3}); + t_img_info.name = "Image information"; + t_img_info.data.Reset(img_info, 3 * sizeof(float)); + + t_img.dtypeid = typeid(float); + // quantize(&img, img_length); + // t_img.dtypeid = typeid(int8_t); + t_img.layout = LAYOUT_HWC; + t_img.shape = std::vector({1, 432, 1280, 3}); + t_img.name = "Image information"; + t_img.data.Reset(img, img_length * sizeof(float)); + // t_img.data.Reset(img, img_length * sizeof(int8_t)); + // for(int i = 0; i < 100; ++i){ + predictor->FeedPaddleTensors({t_img_info, t_img}); + + std::cout << "Finishing feeding data " << std::endl; + + predictor->Predict_From_To(0, -1); + std::cout << "Finishing predicting " << std::endl; + + std::vector v; // No need to initialize v + predictor->FetchPaddleTensors(&v); // Old data in v will be cleared + std::cout << "Output number is " << v.size() << std::endl; + for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) { + std::string dumpName = "marker_api_fetch_" + std::to_string(fetchNum); + dump_stride(dumpName, v[fetchNum]); + } + return 0; +} diff --git a/test/fpga/test_mobilenet_api.cpp b/test/fpga/test_mobilenet_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4b372773937722942b70c584dda1eeb22339841f --- /dev/null +++ b/test/fpga/test_mobilenet_api.cpp @@ -0,0 +1,158 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/test/fpga/test_mobilenet_api.cpp b/test/fpga/test_mobilenet_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b372773937722942b70c584dda1eeb22339841f
--- /dev/null
+++ b/test/fpga/test_mobilenet_api.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_MOBILE_FPGA
+#define PADDLE_MOBILE_FPGA
+#endif
+#include <fstream>
+#include <iostream>
+#include "../../src/io/paddle_inference_api.h"
+
+using namespace paddle_mobile;        // NOLINT
+using namespace paddle_mobile::fpga;  // NOLINT
+
+static const char *g_image = "../images/mobilenet_txtdata/1.txt";
+static const char *g_model = "../models/keycurve_l2_regular4_model/__model__";
+static const char *g_param =
+    "../models/keycurve_l2_regular4_model/model.params";
+
+void readStream(std::string filename, float *buf) {
+  std::ifstream in;
+  in.open(filename, std::ios::in);
+  if (!in.is_open()) {
+    std::cout << "open File Failed." << std::endl;
+    return;
+  }
+  int i = 0;
+  while (!in.eof()) {
+    in >> buf[i];
+    i++;
+  }
+  in.close();
+}
+
+signed char float_to_int8(float fdata) {
+  if (fdata < 0.0) {
+    fdata -= 0.5;
+  } else {
+    fdata += 0.5;
+  }
+  return (signed char)fdata;
+}
+
+void quantize(float **data_in, int data_size) {
+  float *tmp = *data_in;
+  signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
+  for (int i = 0; i < data_size; i++) {
+    tmp_data[i] = float_to_int8((*data_in)[i] + 128);
+  }
+  *data_in = (float *)tmp_data;  // NOLINT
+  fpga_free(tmp);
+}
+
+void convert_to_chw(float **data_in, int channel, int height, int width,
+                    float *data_tmp) {
+  int64_t amount_per_side = width * height;
+  for (int h = 0; h < height; h++) {
+    for (int w = 0; w < width; w++) {
+      for (int c = 0; c < channel; c++) {
+        *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++);
+      }
+    }
+  }
+}
+
+void dump_stride_float(std::string filename, PaddleTensor input_tensor) {
+  auto data_ptr = reinterpret_cast<float *>(input_tensor.data.data());
+  int c = (input_tensor.shape)[1];
+  int h = (input_tensor.shape)[2];
+  int w = (input_tensor.shape)[3];
+  int n = (input_tensor.shape)[0];
+  float *data_tmp =
+      reinterpret_cast<float *>(malloc(c * h * w * sizeof(float)));
+  convert_to_chw(&data_ptr, c, h, w, data_tmp);
+  std::ofstream out(filename.c_str());
+  float result = 0;
+  int datasize = abs(c * h * w * n);
+  if (datasize == 0) {
+    std::cout << "wrong dump data size" << std::endl;
+    return;
+  }
+  for (int i = 0; i < datasize; i++) {
+    result = data_tmp[i];
+    out << result << std::endl;
+  }
+  out.close();
+}
+
+void dump_stride(std::string filename, PaddleTensor input_tensor) {
+  if (input_tensor.dtypeid == typeid(float)) {
+    dump_stride_float(filename, input_tensor);
+  } else {
+    std::cout << "only support dumping float data" << std::endl;
+  }
+}
+
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.prog_file = g_model;
+  config.param_file = g_param;
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.lod_mode = true;
+  config.quantification = false;
+  return config;
+}
+
+int main() {
+  open_device();
+  PaddleMobileConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddlePredictor::EngineKind::kPaddleMobile>(
+          config);
+
+  std::cout << "Finishing loading model" << std::endl;
+  int img_length = 256 * 416 * 3;
+  auto img =
+      reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
+  readStream(g_image, img);
+
+  std::cout << "Finishing initializing data" << std::endl;
+  struct PaddleTensor t_img;
+  t_img.dtype = FLOAT32;
+  t_img.dtypeid = typeid(float);
+  // quantize(&img, img_length);
+  // t_img.dtype = INT8;
+  // t_img.dtypeid = typeid(int8_t);
+  t_img.layout = LAYOUT_HWC;
+  t_img.shape = std::vector<int>({1, 256, 416, 3});
+  t_img.name = "Image information";
+  t_img.data.Reset(img, img_length * sizeof(float));
+  // t_img.data.Reset(img, img_length * sizeof(int8_t));
+  predictor->FeedPaddleTensors({t_img});
+
+  std::cout << "Finishing feeding data " << std::endl;
+
+  predictor->Predict_From_To(0, -1);
+  std::cout << "Finishing predicting " << std::endl;
+
+  std::vector<PaddleTensor> v;        // No need to initialize v
+  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
+  std::cout << "Output number is " << v.size() << std::endl;
+  for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) {
+    std::string dumpName = "mobilenet_api_fetch_" + std::to_string(fetchNum);
+    dump_stride(dumpName, v[fetchNum]);
+  }
+  return 0;
+}
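One caveat with the text-reading helper shared by this test and test_yolo_api.cpp: while (!in.eof()) only detects end-of-file after an extraction has already failed, so the loop runs one extra iteration and never bounds-checks the buffer. A safer variant (hypothetical name, same idea with an explicit capacity):

#include <fstream>
#include <string>

int read_floats(const std::string &filename, float *buf, int capacity) {
  std::ifstream in(filename);
  int i = 0;
  while (i < capacity && in >> buf[i]) {  // stop on EOF or parse failure
    i++;
  }
  return i;  // number of values actually read
}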
diff --git a/test/fpga/test_rfcn_api.cpp b/test/fpga/test_rfcn_api.cpp
index f787d8f9acfe85ead101aeb16a4fbebe1aefee65..a52be938dee6df098a1f20a5df5cd852e0bfe37f 100644
--- a/test/fpga/test_rfcn_api.cpp
+++ b/test/fpga/test_rfcn_api.cpp
@@ -12,18 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#ifndef PADDLE_MOBILE_FPGA
+#define PADDLE_MOBILE_FPGA
+#endif
+#include <fstream>
 #include <iostream>
-#include "../test_helper.h"
-#include "../test_include.h"
+#include "../../src/io/paddle_inference_api.h"

-#ifdef PADDLE_MOBILE_FPGA_V1
-#include "fpga/V1/api.h"
-#endif
-#ifdef PADDLE_MOBILE_FPGA_V2
-#include "fpga/V2/api.h"
-#endif
+using namespace paddle_mobile;
+using namespace paddle_mobile::fpga;

-#include <string>
+static const char *g_image = "../models/rfcn/data.bin";
+static const char *g_model = "../models/rfcn/model";
+static const char *g_param = "../models/rfcn/params";

 void readStream(std::string filename, char *buf) {
   std::ifstream in;
@@ -37,116 +38,128 @@ void readStream(std::string filename, char *buf) {
   auto length = in.tellg();    // report location (this is the length)
   in.seekg(0, std::ios::beg);  // go back to the beginning
   in.read(buf, length);
-  DLOG << length;
   in.close();
 }

-void convert_to_chw(int16_t **data_in, int channel, int height, int width,
-                    int num, int16_t *data_tmp) {
-  int64_t amount_per_side = width * height;
-  for (int n = 0; n < num; n++) {
-    for (int h = 0; h < height; h++) {
-      for (int w = 0; w < width; w++) {
-        for (int c = 0; c < channel; c++) {
-          *(data_tmp + n * amount_per_side * channel + c * amount_per_side +
-            width * h + w) = *((*data_in)++);
-        }
-      }
-    }
-  }
-}
-
-void dump_stride_half(std::string filename, Tensor input_tensor,
-                      const int dumpnum, bool use_chw) {
-  // bool use_chw = true;
-  if (input_tensor.dims().size() != 4) return;
-  int c = (input_tensor.dims())[1];
-  int h = (input_tensor.dims())[2];
-  int w = (input_tensor.dims())[3];
-  int n = (input_tensor.dims())[0];
-  auto data_ptr = input_tensor.get_data();
-  auto *data_ptr_16 = reinterpret_cast<int16_t *>(data_ptr);
-  auto data_tmp = data_ptr_16;
-  if (use_chw) {
-    data_tmp =
-        reinterpret_cast<int16_t *>(malloc(n * c * h * w * sizeof(int16_t)));
-    convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
-  }
-  std::ofstream out(filename.c_str());
-  float result = 0;
-  int stride = input_tensor.numel() / dumpnum;
-  stride = stride > 0 ? stride : 1;
-  for (int i = 0; i < input_tensor.numel(); i += stride) {
-    result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
-    out << result << std::endl;
-  }
-  out.close();
-  if (data_tmp != data_ptr_16) {
-    free(data_tmp);
-  }
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.prog_file = g_model;
+  config.param_file = g_param;
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.lod_mode = true;
+  config.quantification = false;
+  return config;
 }

-void dump_stride_float(std::string filename, Tensor input_tensor,
-                       const int dumpnum) {
-  auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
-  std::ofstream out(filename.c_str());
-  float result = 0;
-  int stride = input_tensor.numel() / dumpnum;
-  stride = stride > 0 ? stride : 1;
-  for (int i = 0; i < input_tensor.numel(); i += stride) {
-    result = data_ptr[i];
-    out << result << std::endl;
-  }
-  out.close();
+PaddleMobileConfig GetConfig1() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.model_dir = "../models/resnet50";
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.quantification = false;
+  return config;
 }

<< "Finishing predicting " << std::endl; + + std::vector v; // No need to initialize v + predictor->FetchPaddleTensors(&v); // Old data in v will be cleared + std::cout << "Output number is " << v.size() << std::endl; + std::cout << "out[0] length " << v[0].data.length() << std::endl; + std::cout << "out[1] length " << v[1].data.length() << std::endl; + std::cout << "out[2] length " << v[2].data.length() << std::endl; + + auto post_nms = v[0].data.length() / sizeof(float) / 8; + for (int num = 0; num < post_nms; num++) { + for (int i = 0; i < 8; i++) { + auto p = reinterpret_cast(v[0].data.data()); + std::cout << p[num * 8 + i] << std::endl; + } } - if (input_tensor.type() == typeid(float)) { - DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); - - dump_stride_float(filename, input_tensor, dumpnum); - } else { - DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); - - dump_stride_half(filename, input_tensor, dumpnum, use_chw); + for (int num = 0; num < post_nms; num++) { + for (int i = 0; i < 8; i++) { + auto p = reinterpret_cast(v[1].data.data()); + std::cout << p[num * 8 + i] << std::endl; + } } - DLOG << "dump input address: " << input_tensor.get_data(); -} - -static const char *g_rfcn_combine = "../models/rfcn"; -static const char *g_image_src_float = "../models/rfcn/data.bin"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - - if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", - std::string(g_rfcn_combine) + "/params", true, false, - 1, true)) { - float img_info[3] = {768, 1536, 768.0f / 960.0f}; - auto img = reinterpret_cast( - fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float))); - readStream(g_image_src_float, reinterpret_cast(img)); - - std::vector v(3, nullptr); - paddle_mobile.FeedData(std::vector({img_info, img})); - paddle_mobile.Predict_To(-1); - - for (int i = 65; i < 69; i++) { - auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "rfcn_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), - tensor_ptr->numel() * sizeof(float)); - dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true); + for (int num = 0; num < post_nms; num++) { + for (int i = 0; i < 4; i++) { + auto p = reinterpret_cast(v[2].data.data()); + std::cout << p[num * 4 + i] << std::endl; } - // paddle_mobile.GetResults(&v); - DLOG << "Computation done"; - fpga::fpga_free(img); } + std::cout << "Finish getting vector values" << std::endl; return 0; } diff --git a/test/fpga/test_yolo_api.cpp b/test/fpga/test_yolo_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4ef890506eb1c40638242b9767267756a64da787 --- /dev/null +++ b/test/fpga/test_yolo_api.cpp @@ -0,0 +1,158 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/test/fpga/test_yolo_api.cpp b/test/fpga/test_yolo_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ef890506eb1c40638242b9767267756a64da787
--- /dev/null
+++ b/test/fpga/test_yolo_api.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_MOBILE_FPGA
+#define PADDLE_MOBILE_FPGA
+#endif
+#include <fstream>
+#include <iostream>
+#include "../../src/io/paddle_inference_api.h"
+
+using namespace paddle_mobile;        // NOLINT
+using namespace paddle_mobile::fpga;  // NOLINT
+
+static const char *g_image = "../images/yolo_test_txtimg/1.txt";
+static const char *g_model = "../models/yolo_bn_l2_model/__model__";
+static const char *g_param = "../models/yolo_bn_l2_model/model.params";
+
+void readStream(std::string filename, float *buf) {
+  std::ifstream in;
+  in.open(filename, std::ios::in);
+  if (!in.is_open()) {
+    std::cout << "open File Failed." << std::endl;
+    return;
+  }
+  int i = 0;
+  while (!in.eof()) {
+    in >> buf[i];
+    i++;
+  }
+  in.close();
+}
+
+signed char float_to_int8(float fdata) {
+  if (fdata < 0.0) {
+    fdata -= 0.5;
+  } else {
+    fdata += 0.5;
+  }
+  return (signed char)fdata;
+}
+
+void quantize(float **data_in, int data_size) {
+  float *tmp = *data_in;
+  signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
+  for (int i = 0; i < data_size; i++) {
+    tmp_data[i] = float_to_int8((*data_in)[i] + 128);
+  }
+  *data_in = (float *)tmp_data;  // NOLINT
+  fpga_free(tmp);
+}
+
+void convert_to_chw(float **data_in, int channel, int height, int width,
+                    float *data_tmp) {
+  int64_t amount_per_side = width * height;
+  for (int h = 0; h < height; h++) {
+    for (int w = 0; w < width; w++) {
+      for (int c = 0; c < channel; c++) {
+        *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++);
+      }
+    }
+  }
+}
+
+void dump_stride_float(std::string filename, PaddleTensor input_tensor) {
+  auto data_ptr = reinterpret_cast<float *>(input_tensor.data.data());
+  int c = (input_tensor.shape)[1];
+  int h = (input_tensor.shape)[2];
+  int w = (input_tensor.shape)[3];
+  int n = (input_tensor.shape)[0];
+  float *data_tmp =
+      reinterpret_cast<float *>(malloc(c * h * w * sizeof(float)));
+  convert_to_chw(&data_ptr, c, h, w, data_tmp);
+  std::ofstream out(filename.c_str());
+  float result = 0;
+  int datasize = abs(c * h * w * n);
+  if (datasize == 0) {
+    std::cout << "wrong dump data size" << std::endl;
+    return;
+  }
+  for (int i = 0; i < datasize; i++) {
+    result = data_tmp[i];
+    out << result << std::endl;
+  }
+  out.close();
+}
+
+void dump_stride(std::string filename, PaddleTensor input_tensor) {
+  if (input_tensor.dtypeid == typeid(float)) {
+    dump_stride_float(filename, input_tensor);
+  } else {
+    std::cout << "only support dumping float data" << std::endl;
+  }
+}
+
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.prog_file = g_model;
+  config.param_file = g_param;
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.lod_mode = true;
+  config.quantification = false;
+  return config;
+}
+
+int main() {
+  open_device();
+  PaddleMobileConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddlePredictor::EngineKind::kPaddleMobile>(
+          config);
+
+  std::cout << "Finishing loading model" << std::endl;
+  int img_length = 256 * 416 * 3;
+  auto img =
+      reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
+  readStream(g_image, img);
+
+  std::cout << "Finishing initializing data" << std::endl;
+  struct PaddleTensor t_img;
+  // t_img.dtype = FLOAT32;
+  // t_img.dtypeid = typeid(float);
+  quantize(&img, img_length);
+  t_img.dtype = INT8;
+  t_img.dtypeid = typeid(int8_t);
+  t_img.layout = LAYOUT_HWC;
+  t_img.shape = std::vector<int>({1, 256, 416, 3});
+  t_img.name = "Image information";
+  // t_img.data.Reset(img, img_length * sizeof(float));
+  t_img.data.Reset(img, img_length * sizeof(int8_t));
+  predictor->FeedPaddleTensors({t_img});
+
+  std::cout << "Finishing feeding data " << std::endl;
+
+  predictor->Predict_From_To(0, -1);
+  std::cout << "Finishing predicting " << std::endl;
+
+  std::vector<PaddleTensor> v;        // No need to initialize v
+  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
+  std::cout << "Output number is " << v.size() << std::endl;
+  for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) {
+    std::string dumpName = "yolo_api_fetch_" + std::to_string(fetchNum);
+    dump_stride(dumpName, v[fetchNum]);
+  }
+  return 0;
+}
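A final note on the quantization path exercised by test_yolo_api.cpp: quantize() shifts every input by +128 and float_to_int8 rounds half away from zero before truncating to a signed char, so the resulting byte is only meaningful if the FPGA side reads it back as unsigned (values above 127 do not fit in a signed char). A runnable check of the in-range rounding behavior, reusing the helper verbatim:

#include <cassert>

signed char float_to_int8(float fdata) {
  if (fdata < 0.0) {
    fdata -= 0.5;
  } else {
    fdata += 0.5;
  }
  return (signed char)fdata;
}

int main() {
  assert(float_to_int8(2.4f) == 2);    // 2.4 + 0.5 = 2.9, truncates to 2
  assert(float_to_int8(2.6f) == 3);    // 3.1 truncates to 3
  assert(float_to_int8(-2.6f) == -3);  // -3.1 truncates to -3
  // quantize() feeds float_to_int8(x + 128): inputs near the bottom of the
  // range stay inside signed char, but inputs near +127 exceed it and rely
  // on the consumer reinterpreting the byte as unsigned.
  return 0;
}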