Commit 4da635f8 authored by zhangyang0701

Change interface functions for the FPGA track

Parent 68ccfd58
@@ -70,10 +70,11 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
     DLOG << "Wrong ofm dimension";
   }
   auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(half));
   ofm_tensor->fpga_data_num = memory_size / sizeof(half);
+  fpga::fpga_flush(p, memory_size);
 }

 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
@@ -89,10 +90,11 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
     DLOG << "Wrong ofm dimension";
   }
   auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(half));
   ofm_tensor->fpga_data_num = memory_size / sizeof(half);
+  fpga::fpga_flush(p, memory_size);
 }

 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
@@ -108,10 +110,11 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
     DLOG << "Wrong ofm dimension";
   }
   auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(float));
   ofm_tensor->fpga_data_num = memory_size / sizeof(float);
+  fpga::fpga_flush(p, memory_size);
 }

 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -463,9 +466,24 @@ void expand_EW_arg(EWAddArgs *arg) {
   uint64_t image_amount_per_row =
       align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
                  IMAGE_ALIGNMENT);
-  uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
-                               ((uint64_t)args.image0.width << 16) |
-                               (uint64_t)args.image0.height;
+  //////////////////////////////////////////////////////////
+  // temporary modify for EW and DMA problem
+  uint64_t image_image_pixel = 0;
+  if ((args.image0.width * args.image0.channels) >= 24576) {
+    if ((args.image0.width * args.image0.channels) % 32 != 0) {
+      DLOG << "EW parameter can not be support";
+    } else {
+      image_amount_per_row = image_amount_per_row / 2;
+      image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                          ((uint64_t)(args.image0.width / 2) << 16) |
+                          (uint64_t)(args.image0.height * 2);
+    }
+  } else {
+    image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                        ((uint64_t)args.image0.width << 16) |
+                        (uint64_t)args.image0.height;
+  }
+  //////////////////////////////////////////////////////////

   (*arg).driver.image0_address_phy = image0_address_phy;
   (*arg).driver.image1_address_phy = image1_address_phy;
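As a worked illustration of the branch above (values invented for illustration, not taken from the commit): an image0 with width 1280 and 24 channels hits the workaround, since 1280 * 24 = 30720 >= 24576 and 30720 % 32 == 0, so the descriptor is repacked at half width and double height:

    // Illustrative numbers only: width = 1280, channels = 24, height = 8.
    // 1280 * 24 = 30720  ->  >= 24576 and divisible by 32, so:
    //   image_amount_per_row /= 2;      // half the bytes per DMA row
    //   packed width  = 1280 / 2 = 640;
    //   packed height = 8 * 2    = 16;  // same total pixel count as before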
@@ -560,6 +578,18 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
         reinterpret_cast<char *>(arg->conv_arg[i].filter_address), deleter));
     memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
     fpga_flush(arg->conv_arg[i].filter_address, filter_size);
+    // for test
+    // {
+    //   static int cnt = 0;
+    //   if(cnt == 4){
+    //     int8_t result = 0;
+    //     std::string str = "fc_filter";
+    //     fpga::savefile<int8_t>(str, arg->conv_arg[i].filter_address,
+    //                            filter_size, result);
+    //
+    //   }
+    //   cnt++;
+    // }

     size_t bs_size = 2 *
         align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
@@ -570,6 +600,18 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
         reinterpret_cast<char *>(arg->conv_arg[i].sb_address), deleter));
     memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
     fpga_flush(arg->conv_arg[i].sb_address, bs_size);
+    // for test
+    /*{
+      static int cnt = 0;
+      if(cnt == 4){
+        float result = 0;
+        std::string str = "fc_bs";
+        fpga::savefile<float>(str, arg->conv_arg[i].sb_address, bs_size / 4,
+                              result);
+      }
+      cnt++;
+    }*/

     if (n > 1) {
       arg->conv_arg[i].output.scale_address =
......
@@ -268,6 +268,7 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
   quantize(data_in, data_size, max);
   char **quantize_data = (char **)data_in;  // NOLINT
   convert_fc_filter(quantize_data, num, chw);
+  convert_to_hwc(quantize_data, num, channel, height, width);
   align_element(quantize_data, num, chw);
   if (num_after_alignment != num) {
     align_num(quantize_data, num_per_div_before_alignment, num, chw);
@@ -316,7 +317,7 @@ void align_element_n(int16_t **data_in, int num, int height, int width) {
     }
     *data_in = data_tmp;
-    free(tmp);
+    fpga_free(tmp);
   }
 }

 void quantize_to_fp16(float **data_in, int num, int height, int width,
......
@@ -89,11 +89,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
     InitMemory();
   }

-#ifdef PADDLE_MOBILE_FPGA
-  program_.scope->EraseVars({"feed", "fetch"});
-  program_.scope->print_vars();
-#endif
-
   int count = 0;
   for (auto &op_handler : ops_of_block0_) {
     DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
@@ -512,6 +507,33 @@ PMStatus Executor<Device, T>::Predict() {
   return PMSuccess;
 }

+template <typename Device, typename T>
+void Executor<Device, T>::FeedTensorData(const vector<framework::Tensor> &v) {
+  auto input_size = v.size();
+  auto *feed_var = program_.scope->Var("feed");
+  PADDLE_MOBILE_ENFORCE(input_size == feed_indices_.size(),
+                        "input data number not correct");
+  for (int i = 0; i < input_size; i++) {
+    framework::LoDTensor &target =
+        feed_var->template GetMutable<framework::LoDTensorArray>()->at(i);
+    target.ShareDataWith(v[input_size - i - 1]);
+    DLOG << target.external_data;
+  }
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::GetTensorResults(
+    std::vector<framework::Tensor *> *v) {
+  auto *fetch_var = program_.scope->Var("fetch");
+  auto output_size = fetch_indices_.size();
+  for (int i = 0; i < output_size; i++) {
+    framework::LoDTensor &target =
+        fetch_var->template GetMutable<framework::LoDTensorArray>()->at(i);
+    v->push_back(&target);
+  }
+}
+
 #ifdef PADDLE_MOBILE_FPGA
 template <typename Device, typename T>
 void Executor<Device, T>::InjectVariable(const Tensor &t,
@@ -557,19 +579,6 @@ void Executor<Device, T>::GetResults(std::vector<void *> *v) {
   }
 }

-template <typename Device, typename T>
-void Executor<Device, T>::GetTensorResults(
-    std::vector<framework::Tensor *> *v) {
-  int index = 0;
-  auto vars = program_.scope->VarContain("fetch", &index);
-  auto output_size = vars.size();
-  for (int i = 0; i < output_size; i++) {
-    auto var = program_.scope->Var("fetch", i + index);
-    auto fetch_tensor = var->template GetMutable<LoDTensor>();
-    v->push_back(fetch_tensor);
-  }
-}
-
 template <typename Device, typename T>
 framework::Tensor *Executor<Device, T>::GetTensorByName(
     const std::string &name) {
......
@@ -51,15 +51,15 @@ class Executor {
   std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);

+  void FeedTensorData(const std::vector<framework::Tensor> &v);
+  void GetTensorResults(std::vector<framework::Tensor *> *v);
+
 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const Tensor &t, std::string var_name);
   void FeedData(const Tensor &t);
   void FeedData(const std::vector<void *> &v);
   void GetResults(std::vector<void *> *v);
-  void GetTensorResults(std::vector<framework::Tensor *> *v);
   framework::Tensor *GetTensorByName(const std::string &name);
   std::shared_ptr<Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
......
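Taken together with the executor.cpp hunk above, these declarations move the tensor-based feed/fetch path out of the FPGA-only guard. A minimal usage sketch (hypothetical calling code, not part of the commit; assumes a loaded Executor instance, and an FPGA build for the guarded Predict_From_To):

    // Hedged sketch of the relocated interface, not committed code.
    std::vector<framework::Tensor> inputs;   // one entry per "feed" slot
    // ... fill each tensor with properly formatted input data ...
    executor->FeedTensorData(inputs);        // ShareDataWith into the feed array
    executor->Predict_From_To(0, -1);        // run all ops in block 0
    std::vector<framework::Tensor *> outputs;
    executor->GetTensorResults(&outputs);    // pointers into the fetch array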
@@ -50,9 +50,6 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
       attrs_(attrs),
       scope_(scope) {
   CheckAllInputOutputSet();
-#ifdef PADDLE_MOBILE_FPGA
-  InsertTensors();
-#endif
 }

 template <typename Dtype>
@@ -72,6 +69,9 @@ void OperatorBase<Dtype>::Run() {
         var->template IsType<framework::LoDTensor>()) {
       const Tensor *tensor = var->template Get<framework::LoDTensor>();
       if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
+#ifdef PADDLE_MOBILE_FPGA
+      DLOG << var_vec_in[i];
+#endif
     }
   }
 }
@@ -83,6 +83,9 @@ void OperatorBase<Dtype>::Run() {
         var->template IsType<framework::LoDTensor>()) {
       const Tensor *tensor = var->template Get<framework::LoDTensor>();
       if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
+#ifdef PADDLE_MOBILE_FPGA
+      DLOG << var_vec_out[i];
+#endif
     }
   }
 }
......
@@ -146,7 +146,7 @@ void PaddleMobilePredictor<Device, T>::FeedPaddleTensors(
     tensors[i].init(typeid(float));
     ConvertPaddleTensors(inputs[i], &tensors[i]);
   }
-  // paddle_mobile_->FeedTensorData(tensors);
+  paddle_mobile_->FeedTensorData(tensors);
 }

 template <typename Device, typename T>
......
@@ -236,6 +236,11 @@ template <typename Device, typename T>
 void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
   executor_->FeedData(v);
 }

+template <typename Device, typename T>
+void PaddleMobile<Device, T>::FeedTensorData(
+    const std::vector<framework::Tensor> &v) {
+  executor_->FeedTensorData(v);
+}
+
 template <typename Device, typename T>
 void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
......
@@ -91,6 +91,7 @@ class PaddleMobile {
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
   void FeedData(const std::vector<void *> &v);
+  void FeedTensorData(const std::vector<framework::Tensor> &v);
   void GetResults(std::vector<void *> *v);
   void GetTensorResults(std::vector<framework::Tensor *> *v);
......
@@ -21,6 +21,7 @@ template <>
 bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
   auto output = param->Out();
   int col = param->Col();
+  DLOG << "col = " << col;
   auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
   input->init(typeid(float));
   input->Resize(output->dims());
......
@@ -19,6 +19,7 @@ template <>
 bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
   int col = param->Col();
+  DLOG << "col = " << col;
   auto output = &(param->Out()->at(col));
   if (input->type() == typeid(float)) {
     return true;
@@ -59,7 +60,11 @@ template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   auto input = const_cast<LoDTensor *>(param.InputX());
   int col = param.Col();
-  LoDTensor *out = &param.Out()->at(col);
+  auto output = &param.Out()->at(col);
+  if (input->type() == typeid(float)) {
+    output->ShareDataWith(*input);
+    return;
+  }

   fpga::BypassArgs args = param.fpga_bypass_args;
   auto input_address = (input->data<half>());
@@ -67,7 +72,7 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   float *outdata_ptr =
       reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
   const int num_th = 32;
-  if ((out->fpga_data_num) < num_th) {
+  if (output->fpga_data_num < num_th) {
     fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
     for (int idx = 0; idx < product(input->dims()); ++idx) {
@@ -77,14 +82,14 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   }

   fpga::PerformBypass(args);
-  auto outC = out->dims()[1];
-  auto outH = out->dims()[2];
-  auto outW = out->dims()[3];
+  auto outC = output->dims()[1];
+  auto outH = output->dims()[2];
+  auto outW = output->dims()[3];
   fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        out->fpga_data_num * sizeof(float));
-  if (out->fpga_data_num != product(input->dims())) {
+                        output->fpga_data_num * sizeof(float));
+  if (output->fpga_data_num != product(input->dims())) {
     float *data_tmp =
         reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
     dealign(outdata_ptr, data_tmp, outC, outH, outW);
@@ -92,7 +97,6 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
     free(data_tmp);
   }
 }
-
 template class FetchKernel<FPGA, float>;
 }  // namespace operators
......
@@ -68,23 +68,38 @@ endif ()
 list(FIND NET "FPGA_NET_V1" CON)
 if (CON GREATER -1)
-    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-resnet50 paddle-mobile)
-    ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-densebox paddle-mobile)
-    ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-rfcn paddle-mobile)
-    ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-marker paddle-mobile)
+    #ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-resnet50 paddle-mobile)
+    #ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-densebox paddle-mobile)
+    #ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-rfcn paddle-mobile)
+    #ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-marker paddle-mobile)

     ADD_EXECUTABLE(test-rfcn-api fpga/test_rfcn_api.cpp)
     target_link_libraries(test-rfcn-api paddle-mobile)

+    ADD_EXECUTABLE(test-mobilenet-api fpga/test_mobilenet_api.cpp)
+    target_link_libraries(test-mobilenet-api paddle-mobile)
+
+    ADD_EXECUTABLE(test-yolo-api fpga/test_yolo_api.cpp)
+    target_link_libraries(test-yolo-api paddle-mobile)
+
+    ADD_EXECUTABLE(test-marker-api fpga/test_marker_api.cpp)
+    target_link_libraries(test-marker-api paddle-mobile)
+
-    ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h )
-    target_link_libraries(test-marker2 paddle-mobile)
+    #ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h )
+    #target_link_libraries(test-marker2 paddle-mobile)
+    #ADD_EXECUTABLE(test-mobilenet fpga/test_mobilenet_beijing.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-mobilenet paddle-mobile)
+    #ADD_EXECUTABLE(test-yolo fpga/test_yolo_combine.cpp test_helper.h test_include.h executor_for_test.h)
+    #target_link_libraries(test-yolo paddle-mobile)

     set(FOUND_MATCH ON)
 endif ()
......
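Since the whole block is gated on finding "FPGA_NET_V1" in NET, the new API tests would presumably be built with a configuration along these lines (an illustrative invocation, not taken from the commit):

    # Hypothetical build sketch: select the FPGA V1 net to enable these targets.
    cmake .. -DNET=FPGA_NET_V1
    make test-rfcn-api test-mobilenet-api test-yolo-api test-marker-api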
test/fpga/test_marker_api.cpp (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef PADDLE_MOBILE_FPGA
#define PADDLE_MOBILE_FPGA
#endif

#include <fstream>
#include <iostream>
#include "../../src/io/paddle_inference_api.h"

using namespace paddle_mobile;
using namespace paddle_mobile::fpga;

static const char *g_image = "../models/marker/model/image.bin";
static const char *g_model = "../models/marker/model/model";
static const char *g_param = "../models/marker/model/params";

static const char *g_image1 = "../models/marker2/model/marker.bin";
static const char *g_model1 = "../models/marker2/model/model";
static const char *g_param1 = "../models/marker2/model/params";

void readStream(std::string filename, char *buf) {
  std::ifstream in;
  in.open(filename, std::ios::in | std::ios::binary);
  if (!in.is_open()) {
    std::cout << "open File Failed." << std::endl;
    return;
  }
  in.seekg(0, std::ios::end);  // go to the end
  auto length = in.tellg();    // report location (this is the length)
  in.seekg(0, std::ios::beg);  // go back to the beginning
  in.read(buf, length);
  in.close();
}

signed char float_to_int8(float fdata) {
  if (fdata < 0.0) {
    fdata -= 0.5;
  } else {
    fdata += 0.5;
  }
  return (signed char)fdata;
}

void quantize(float **data_in, int data_size) {
  float *tmp = *data_in;
  signed char *tmp_data =
      (signed char *)paddle_mobile::fpga::fpga_malloc(data_size * sizeof(char));
  for (int i = 0; i < data_size; i++) {
    tmp_data[i] = float_to_int8((*data_in)[i] + 128);
  }
  *data_in = (float *)tmp_data;  // NOLINT
  paddle_mobile::fpga::fpga_free(tmp);
}

void convert_to_chw(float **data_in, int channel, int height, int width,
                    float *data_tmp) {
  int64_t amount_per_side = width * height;
  for (int h = 0; h < height; h++) {
    for (int w = 0; w < width; w++) {
      for (int c = 0; c < channel; c++) {
        *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++);
      }
    }
  }
}

void dump_stride_float(std::string filename,
                       paddle_mobile::PaddleTensor input_tensor) {
  auto data_ptr = reinterpret_cast<float *>(input_tensor.data.data());
  int c = (input_tensor.shape)[1];
  int h = (input_tensor.shape)[2];
  int w = (input_tensor.shape)[3];
  int n = (input_tensor.shape)[0];
  float *data_tmp =
      reinterpret_cast<float *>(malloc(c * h * w * sizeof(float)));
  // convert_to_chw(&data_ptr, c, h, w, data_tmp);
  std::ofstream out(filename.c_str());
  float result = 0;
  int datasize = abs(c * h * w * n);
  if (datasize == 0) {
    std::cout << "wrong dump data size" << std::endl;
    return;
  }
  for (int i = 0; i < datasize; i++) {
    result = data_ptr[i];
    out << result << std::endl;
  }
  out.close();
}

void dump_stride(std::string filename,
                 paddle_mobile::PaddleTensor input_tensor) {
  if (input_tensor.dtypeid == typeid(float)) {
    dump_stride_float(filename, input_tensor);
  } else {
    std::cout << "only support dumping float data" << std::endl;
  }
}

PaddleMobileConfig GetConfig() {
  PaddleMobileConfig config;
  config.precision = PaddleMobileConfig::FP32;
  config.device = PaddleMobileConfig::kFPGA;
  config.prog_file = g_model;
  config.param_file = g_param;
  config.thread_num = 1;
  config.batch_size = 1;
  config.optimize = true;
  config.lod_mode = true;
  config.quantification = false;
  return config;
}

PaddleMobileConfig GetConfig1() {
  PaddleMobileConfig config;
  config.precision = PaddleMobileConfig::FP32;
  config.device = PaddleMobileConfig::kFPGA;
  config.prog_file = g_model1;
  config.param_file = g_param1;
  config.thread_num = 1;
  config.batch_size = 1;
  config.optimize = true;
  config.lod_mode = true;
  config.quantification = false;
  return config;
}

int main() {
  open_device();

  PaddleMobileConfig config1 = GetConfig1();
  auto predictor1 =
      CreatePaddlePredictor<PaddleMobileConfig,
                            PaddleEngineKind::kPaddleMobile>(config1);
  std::cout << "Finishing loading model" << std::endl;

  for (int i = 0; i < 1; ++i) {
    int img_length1 = 144 * 14 * 14;
    auto img1 =
        reinterpret_cast<float *>(fpga_malloc(img_length1 * sizeof(float)));
    readStream(g_image1, reinterpret_cast<char *>(img1));
    std::cout << "Finishing initializing data" << std::endl;

    struct PaddleTensor t_img1;
    t_img1.dtypeid = typeid(float);
    t_img1.layout = LAYOUT_HWC;
    t_img1.shape = std::vector<int>({1, 14, 14, 144});
    t_img1.name = "Image information";
    t_img1.data.Reset(img1, img_length1 * sizeof(float));
    predictor1->FeedPaddleTensors({t_img1});
    std::cout << "Finishing feeding data " << std::endl;

    predictor1->Predict_From_To(0, -1);
    std::cout << "Finishing predicting " << std::endl;

    std::vector<paddle_mobile::PaddleTensor> v1;  // No need to initialize v
    predictor1->FetchPaddleTensors(&v1);  // Old data in v will be cleared
    std::cout << "Output number is " << v1.size() << std::endl;
    for (int fetchNum = 0; fetchNum < v1.size(); fetchNum++) {
      std::string dumpName = "marker2_api_fetch_" + std::to_string(fetchNum);
      dump_stride(dumpName, v1[fetchNum]);
    }
  }

  /////////////////////////////////////

  PaddleMobileConfig config = GetConfig();
  auto predictor =
      CreatePaddlePredictor<PaddleMobileConfig,
                            PaddleEngineKind::kPaddleMobile>(config);
  std::cout << "Finishing loading model" << std::endl;

  float img_info[3] = {432, 1280, 1.0f};
  int img_length = 432 * 1280 * 3;
  auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
  readStream(g_image, reinterpret_cast<char *>(img));
  std::cout << "Finishing initializing data" << std::endl;

  struct PaddleTensor t_img_info, t_img;
  t_img_info.dtypeid = typeid(float);
  t_img_info.layout = LAYOUT_HWC;
  t_img_info.shape = std::vector<int>({1, 3});
  t_img_info.name = "Image information";
  t_img_info.data.Reset(img_info, 3 * sizeof(float));

  t_img.dtypeid = typeid(float);
  // quantize(&img, img_length);
  // t_img.dtypeid = typeid(int8_t);
  t_img.layout = LAYOUT_HWC;
  t_img.shape = std::vector<int>({1, 432, 1280, 3});
  t_img.name = "Image information";
  t_img.data.Reset(img, img_length * sizeof(float));
  // t_img.data.Reset(img, img_length * sizeof(int8_t));

  // for (int i = 0; i < 100; ++i) {
  predictor->FeedPaddleTensors({t_img_info, t_img});
  std::cout << "Finishing feeding data " << std::endl;

  predictor->Predict_From_To(0, -1);
  std::cout << "Finishing predicting " << std::endl;

  std::vector<paddle_mobile::PaddleTensor> v;  // No need to initialize v
  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
  std::cout << "Output number is " << v.size() << std::endl;
  for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) {
    std::string dumpName = "marker_api_fetch_" + std::to_string(fetchNum);
    dump_stride(dumpName, v[fetchNum]);
  }
  return 0;
}
test/fpga/test_mobilenet_api.cpp (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef PADDLE_MOBILE_FPGA
#define PADDLE_MOBILE_FPGA
#endif

#include <fstream>
#include <iostream>
#include "../../src/io/paddle_inference_api.h"

using namespace paddle_mobile;        // NOLINT
using namespace paddle_mobile::fpga;  // NOLINT

static const char *g_image = "../images/mobilenet_txtdata/1.txt";
static const char *g_model = "../models/keycurve_l2_regular4_model/__model__";
static const char *g_param =
    "../models/keycurve_l2_regular4_model/model.params";

void readStream(std::string filename, float *buf) {
  std::ifstream in;
  in.open(filename, std::ios::in);
  if (!in.is_open()) {
    std::cout << "open File Failed." << std::endl;
    return;
  }
  int i = 0;
  while (!in.eof()) {
    in >> buf[i];
    i++;
  }
  in.close();
}

signed char float_to_int8(float fdata) {
  if (fdata < 0.0) {
    fdata -= 0.5;
  } else {
    fdata += 0.5;
  }
  return (signed char)fdata;
}

void quantize(float **data_in, int data_size) {
  float *tmp = *data_in;
  signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
  for (int i = 0; i < data_size; i++) {
    tmp_data[i] = float_to_int8((*data_in)[i] + 128);
  }
  *data_in = (float *)tmp_data;  // NOLINT
  fpga_free(tmp);
}

void convert_to_chw(float **data_in, int channel, int height, int width,
                    float *data_tmp) {
  int64_t amount_per_side = width * height;
  for (int h = 0; h < height; h++) {
    for (int w = 0; w < width; w++) {
      for (int c = 0; c < channel; c++) {
        *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++);
      }
    }
  }
}

void dump_stride_float(std::string filename, PaddleTensor input_tensor) {
  auto data_ptr = reinterpret_cast<float *>(input_tensor.data.data());
  int c = (input_tensor.shape)[1];
  int h = (input_tensor.shape)[2];
  int w = (input_tensor.shape)[3];
  int n = (input_tensor.shape)[0];
  float *data_tmp =
      reinterpret_cast<float *>(malloc(c * h * w * sizeof(float)));
  convert_to_chw(&data_ptr, c, h, w, data_tmp);
  std::ofstream out(filename.c_str());
  float result = 0;
  int datasize = abs(c * h * w * n);
  if (datasize == 0) {
    std::cout << "wrong dump data size" << std::endl;
    return;
  }
  for (int i = 0; i < datasize; i++) {
    result = data_tmp[i];
    out << result << std::endl;
  }
  out.close();
}

void dump_stride(std::string filename, PaddleTensor input_tensor) {
  if (input_tensor.dtypeid == typeid(float)) {
    dump_stride_float(filename, input_tensor);
  } else {
    std::cout << "only support dumping float data" << std::endl;
  }
}

PaddleMobileConfig GetConfig() {
  PaddleMobileConfig config;
  config.precision = PaddleMobileConfig::FP32;
  config.device = PaddleMobileConfig::kFPGA;
  config.prog_file = g_model;
  config.param_file = g_param;
  config.thread_num = 1;
  config.batch_size = 1;
  config.optimize = true;
  config.lod_mode = true;
  config.quantification = false;
  return config;
}

int main() {
  open_device();
  PaddleMobileConfig config = GetConfig();
  auto predictor =
      CreatePaddlePredictor<paddle_mobile::PaddleMobileConfig,
                            PaddleEngineKind::kPaddleMobile>(config);
  std::cout << "Finishing loading model" << std::endl;

  int img_length = 256 * 416 * 3;
  auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
  readStream(g_image, img);
  std::cout << "Finishing initializing data" << std::endl;

  struct PaddleTensor t_img;
  t_img.dtype = FLOAT32;
  t_img.dtypeid = typeid(float);
  // quantize(&img, img_length);
  // t_img.dtype = INT8;
  // t_img.dtypeid = typeid(int8_t);
  t_img.layout = LAYOUT_HWC;
  t_img.shape = std::vector<int>({1, 256, 416, 3});
  t_img.name = "Image information";
  t_img.data.Reset(img, img_length * sizeof(float));
  // t_img.data.Reset(img, img_length * sizeof(int8_t));
  predictor->FeedPaddleTensors({t_img});
  std::cout << "Finishing feeding data " << std::endl;

  predictor->Predict_From_To(0, -1);
  std::cout << "Finishing predicting " << std::endl;

  std::vector<PaddleTensor> v;        // No need to initialize v
  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
  std::cout << "Output number is " << v.size() << std::endl;
  for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) {
    std::string dumpName = "mobilenet_api_fetch_" + std::to_string(fetchNum);
    dump_stride(dumpName, v[fetchNum]);
  }
  return 0;
}
test/fpga/test_rfcn_api.cpp

@@ -12,18 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#ifndef PADDLE_MOBILE_FPGA
+#define PADDLE_MOBILE_FPGA
+#endif
+
+#include <fstream>
 #include <iostream>
-#include "../test_helper.h"
-#include "../test_include.h"
-#ifdef PADDLE_MOBILE_FPGA_V1
-#include "fpga/V1/api.h"
-#endif
-#ifdef PADDLE_MOBILE_FPGA_V2
-#include "fpga/V2/api.h"
-#endif
-#include <string>
+#include "../../src/io/paddle_inference_api.h"
+
+using namespace paddle_mobile;
+using namespace paddle_mobile::fpga;
+
+static const char *g_image = "../models/rfcn/data.bin";
+static const char *g_model = "../models/rfcn/model";
+static const char *g_param = "../models/rfcn/params";

 void readStream(std::string filename, char *buf) {
   std::ifstream in;
@@ -37,116 +38,128 @@ void readStream(std::string filename, char *buf) {
   auto length = in.tellg();    // report location (this is the length)
   in.seekg(0, std::ios::beg);  // go back to the beginning
   in.read(buf, length);
-  DLOG << length;
   in.close();
 }

-void convert_to_chw(int16_t **data_in, int channel, int height, int width,
-                    int num, int16_t *data_tmp) {
-  int64_t amount_per_side = width * height;
-  for (int n = 0; n < num; n++) {
-    for (int h = 0; h < height; h++) {
-      for (int w = 0; w < width; w++) {
-        for (int c = 0; c < channel; c++) {
-          *(data_tmp + n * amount_per_side * channel + c * amount_per_side +
-            width * h + w) = *((*data_in)++);
-        }
-      }
-    }
-  }
-}
-
-void dump_stride_half(std::string filename, Tensor input_tensor,
-                      const int dumpnum, bool use_chw) {
-  // bool use_chw = true;
-  if (input_tensor.dims().size() != 4) return;
-  int c = (input_tensor.dims())[1];
-  int h = (input_tensor.dims())[2];
-  int w = (input_tensor.dims())[3];
-  int n = (input_tensor.dims())[0];
-  auto data_ptr = input_tensor.get_data();
-  auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
-  auto data_tmp = data_ptr_16;
-  if (use_chw) {
-    data_tmp =
-        reinterpret_cast<half *>(malloc(n * c * h * w * sizeof(int16_t)));
-    convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
-  }
-  std::ofstream out(filename.c_str());
-  float result = 0;
-  int stride = input_tensor.numel() / dumpnum;
-  stride = stride > 0 ? stride : 1;
-  for (int i = 0; i < input_tensor.numel(); i += stride) {
-    result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
-    out << result << std::endl;
-  }
-  out.close();
-  if (data_tmp != data_ptr_16) {
-    free(data_tmp);
-  }
-}
-
-void dump_stride_float(std::string filename, Tensor input_tensor,
-                       const int dumpnum) {
-  auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
-  std::ofstream out(filename.c_str());
-  float result = 0;
-  int stride = input_tensor.numel() / dumpnum;
-  stride = stride > 0 ? stride : 1;
-  for (int i = 0; i < input_tensor.numel(); i += stride) {
-    result = data_ptr[i];
-    out << result << std::endl;
-  }
-  out.close();
-}
-
-void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
-                 bool use_chw) {
-  static int i = 0;
-  if (input_tensor.numel() == 0) {
-    return;
-  }
-  if (input_tensor.type() == typeid(float)) {
-    DLOG << "op: " << i++ << ", float data " << input_tensor.numel();
-    dump_stride_float(filename, input_tensor, dumpnum);
-  } else {
-    DLOG << "op: " << i++ << ", half data " << input_tensor.numel();
-    dump_stride_half(filename, input_tensor, dumpnum, use_chw);
-  }
-  DLOG << "dump input address: " << input_tensor.get_data();
-}
-
-static const char *g_rfcn_combine = "../models/rfcn";
-static const char *g_image_src_float = "../models/rfcn/data.bin";
-
-int main() {
-  paddle_mobile::fpga::open_device();
-  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
-  if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
-                         std::string(g_rfcn_combine) + "/params", true, false,
-                         1, true)) {
-    float img_info[3] = {768, 1536, 768.0f / 960.0f};
-    auto img = reinterpret_cast<float *>(
-        fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)));
-    readStream(g_image_src_float, reinterpret_cast<char *>(img));
-    std::vector<void *> v(3, nullptr);
-    paddle_mobile.FeedData(std::vector<void *>({img_info, img}));
-    paddle_mobile.Predict_To(-1);
-    for (int i = 65; i < 69; i++) {
-      auto tensor_ptr = paddle_mobile.FetchResult(i);
-      std::string saveName = "rfcn_" + std::to_string(i);
-      paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
-                                           tensor_ptr->numel() * sizeof(float));
-      dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true);
-    }
-    // paddle_mobile.GetResults(&v);
-    DLOG << "Computation done";
-    fpga::fpga_free(img);
-  }
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.prog_file = g_model;
+  config.param_file = g_param;
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.lod_mode = true;
+  config.quantification = false;
+  return config;
+}
+
+PaddleMobileConfig GetConfig1() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.model_dir = "../models/resnet50";
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.quantification = false;
+  return config;
+}
+
+int main() {
+  open_device();
+
+  PaddleMobileConfig config1 = GetConfig1();
+  auto predictor1 =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddleEngineKind::kPaddleMobile>(config1);
+  std::cout << "Finishing loading model" << std::endl;
+  int img_length1 = 224 * 224 * 3;
+  auto img1 =
+      reinterpret_cast<float *>(fpga_malloc(img_length1 * sizeof(float)));
+  std::cout << "Finishing initializing data" << std::endl;
+  struct PaddleTensor t_img1;
+  t_img1.dtypeid = typeid(float);
+  t_img1.layout = LAYOUT_HWC;
+  t_img1.shape = std::vector<int>({1, 224, 224, 3});
+  t_img1.name = "Image information";
+  t_img1.data.Reset(img1, img_length1 * sizeof(float));
+  predictor1->FeedPaddleTensors({t_img1});
+  predictor1->Predict_From_To(0, -1);
+  std::cout << "Finishing predicting " << std::endl;
+  std::vector<PaddleTensor> v1;         // No need to initialize v
+  predictor1->FetchPaddleTensors(&v1);  // Old data in v will be cleared
+  std::cout << "Output number is " << v1.size() << std::endl;
+  std::cout << "out[0] length " << v1[0].data.length() << std::endl;
+
+  ////////////////////////////
+
+  PaddleMobileConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddleEngineKind::kPaddleMobile>(config);
+  std::cout << "Finishing loading model" << std::endl;
+  float img_info[3] = {432, 1280, 1.0f};
+  int img_length = 432 * 1280 * 3;
+  auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
+  readStream(g_image, reinterpret_cast<char *>(img));
+  std::cout << "Finishing initializing data" << std::endl;
+  struct PaddleTensor t_img_info, t_img;
+  t_img.dtypeid = typeid(float);
+  t_img_info.layout = LAYOUT_HWC;
+  t_img_info.shape = std::vector<int>({1, 3});
+  t_img_info.name = "Image information";
+  t_img_info.data.Reset(img_info, 3 * sizeof(float));
+  t_img.dtypeid = typeid(float);
+  t_img.layout = LAYOUT_HWC;
+  t_img.shape = std::vector<int>({1, 432, 1280, 3});
+  t_img.name = "Image information";
+  t_img.data.Reset(img, img_length * sizeof(float));
+  predictor->FeedPaddleTensors({t_img_info, t_img});
+  std::cout << "Finishing feeding data " << std::endl;
+  predictor->Predict_From_To(0, -1);
+  std::cout << "Finishing predicting " << std::endl;
+  std::vector<PaddleTensor> v;        // No need to initialize v
+  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
+  std::cout << "Output number is " << v.size() << std::endl;
+  std::cout << "out[0] length " << v[0].data.length() << std::endl;
+  std::cout << "out[1] length " << v[1].data.length() << std::endl;
+  std::cout << "out[2] length " << v[2].data.length() << std::endl;
+
+  auto post_nms = v[0].data.length() / sizeof(float) / 8;
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 8; i++) {
+      auto p = reinterpret_cast<float *>(v[0].data.data());
+      std::cout << p[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 8; i++) {
+      auto p = reinterpret_cast<float *>(v[1].data.data());
+      std::cout << p[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 4; i++) {
+      auto p = reinterpret_cast<float *>(v[2].data.data());
+      std::cout << p[num * 4 + i] << std::endl;
+    }
+  }
+  std::cout << "Finish getting vector values" << std::endl;
   return 0;
 }
test/fpga/test_yolo_api.cpp (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef PADDLE_MOBILE_FPGA
#define PADDLE_MOBILE_FPGA
#endif

#include <fstream>
#include <iostream>
#include "../../src/io/paddle_inference_api.h"

using namespace paddle_mobile;        // NOLINT
using namespace paddle_mobile::fpga;  // NOLINT

static const char *g_image = "../images/yolo_test_txtimg/1.txt";
static const char *g_model = "../models/yolo_bn_l2_model/__model__";
static const char *g_param = "../models/yolo_bn_l2_model/model.params";

void readStream(std::string filename, float *buf) {
  std::ifstream in;
  in.open(filename, std::ios::in);
  if (!in.is_open()) {
    std::cout << "open File Failed." << std::endl;
    return;
  }
  int i = 0;
  while (!in.eof()) {
    in >> buf[i];
    i++;
  }
  in.close();
}

signed char float_to_int8(float fdata) {
  if (fdata < 0.0) {
    fdata -= 0.5;
  } else {
    fdata += 0.5;
  }
  return (signed char)fdata;
}

void quantize(float **data_in, int data_size) {
  float *tmp = *data_in;
  signed char *tmp_data =
      (signed char *)fpga_malloc(data_size * sizeof(char));
  for (int i = 0; i < data_size; i++) {
    tmp_data[i] = float_to_int8((*data_in)[i] + 128);
  }
  *data_in = (float *)tmp_data;  // NOLINT
  fpga_free(tmp);
}

void convert_to_chw(float **data_in, int channel, int height, int width,
                    float *data_tmp) {
  int64_t amount_per_side = width * height;
  for (int h = 0; h < height; h++) {
    for (int w = 0; w < width; w++) {
      for (int c = 0; c < channel; c++) {
        *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++);
      }
    }
  }
}

void dump_stride_float(std::string filename, PaddleTensor input_tensor) {
  auto data_ptr = reinterpret_cast<float *>(input_tensor.data.data());
  int c = (input_tensor.shape)[1];
  int h = (input_tensor.shape)[2];
  int w = (input_tensor.shape)[3];
  int n = (input_tensor.shape)[0];
  float *data_tmp =
      reinterpret_cast<float *>(malloc(c * h * w * sizeof(float)));
  convert_to_chw(&data_ptr, c, h, w, data_tmp);
  std::ofstream out(filename.c_str());
  float result = 0;
  int datasize = abs(c * h * w * n);
  if (datasize == 0) {
    std::cout << "wrong dump data size" << std::endl;
    return;
  }
  for (int i = 0; i < datasize; i++) {
    result = data_tmp[i];
    out << result << std::endl;
  }
  out.close();
}

void dump_stride(std::string filename, PaddleTensor input_tensor) {
  if (input_tensor.dtypeid == typeid(float)) {
    dump_stride_float(filename, input_tensor);
  } else {
    std::cout << "only support dumping float data" << std::endl;
  }
}

PaddleMobileConfig GetConfig() {
  PaddleMobileConfig config;
  config.precision = PaddleMobileConfig::FP32;
  config.device = PaddleMobileConfig::kFPGA;
  config.prog_file = g_model;
  config.param_file = g_param;
  config.thread_num = 1;
  config.batch_size = 1;
  config.optimize = true;
  config.lod_mode = true;
  config.quantification = false;
  return config;
}

int main() {
  open_device();
  PaddleMobileConfig config = GetConfig();
  auto predictor =
      CreatePaddlePredictor<PaddleMobileConfig,
                            PaddleEngineKind::kPaddleMobile>(config);
  std::cout << "Finishing loading model" << std::endl;

  int img_length = 256 * 416 * 3;
  auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
  readStream(g_image, img);
  std::cout << "Finishing initializing data" << std::endl;

  struct PaddleTensor t_img;
  // t_img.dtype = FLOAT32;
  // t_img.dtypeid = typeid(float);
  quantize(&img, img_length);
  t_img.dtype = INT8;
  t_img.dtypeid = typeid(int8_t);
  t_img.layout = LAYOUT_HWC;
  t_img.shape = std::vector<int>({1, 256, 416, 3});
  t_img.name = "Image information";
  // t_img.data.Reset(img, img_length * sizeof(float));
  t_img.data.Reset(img, img_length * sizeof(int8_t));
  predictor->FeedPaddleTensors({t_img});
  std::cout << "Finishing feeding data " << std::endl;

  predictor->Predict_From_To(0, -1);
  std::cout << "Finishing predicting " << std::endl;

  std::vector<PaddleTensor> v;        // No need to initialize v
  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
  std::cout << "Output number is " << v.size() << std::endl;
  for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) {
    std::string dumpName = "yolo_api_fetch_" + std::to_string(fetchNum);
    dump_stride(dumpName, v[fetchNum]);
  }
  return 0;
}
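For reference, the quantize()/float_to_int8() pair used in this test biases each float by +128 and then rounds half away from zero before narrowing to signed char. A hand trace with an illustrative input value (not data from the commit):

    // Illustrative trace, assuming an input pixel value of -100.3f:
    //   biased          = -100.3f + 128 = 27.7f
    //   float_to_int8() : 27.7f >= 0, so 27.7f + 0.5f = 28.2f
    //   (signed char)28.2f == 28   // value stored into the int8 input buffer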