update

4fb30240 · hjchen2 · 2bbf3ec6 · 4fb30240 · 4fb30240 · 4fb30240
25 changed files
--- a/src/common/log.h
+++ b/src/common/log.h
@@ -31,7 +31,8 @@ namespace paddle_mobile {

 #ifdef ANDROID

-extern const char *ANDROID_LOG_TAG;
+static const char *ANDROID_LOG_TAG =
+    "paddle_mobile LOG built on " __DATE__ " " __TIME__;

 #define ANDROIDLOGI(...)                                               \
  __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -531,20 +531,6 @@ void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
  }
 }

-template <typename Device, typename T>
-void Executor<Device, T>::FeedTensorData(const vector<framework::Tensor> &v) {
-  auto input_size = v.size();
-  int index = 0;
-  auto vars = program_.scope->VarContain("feed", &index);
-  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
-                        "input data number not correct");
-  for (int i = 0; i < input_size; i++) {
-    auto var = program_.scope->Var("feed", i + index);
-    auto feed_tensor = var->template GetMutable<LoDTensor>();
-    feed_tensor->ShareDataWith(v[i]);
-  }
-}
-
 template <typename Device, typename T>
 void Executor<Device, T>::GetResults(std::vector<void *> *v) {
  auto output_size = v->size();

--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -53,7 +53,6 @@ class Executor {
  void InjectVariable(const Tensor &t, std::string var_name);
  void FeedData(const Tensor &t);
  void FeedData(const std::vector<void *> &v);
-  void FeedTensorData(const std::vector<framework::Tensor> &v);

  void GetResults(std::vector<void *> *v);
  void GetTensorResults(std::vector<framework::Tensor *> *v);

--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -146,7 +146,7 @@ void PaddleMobilePredictor<Device, T>::FeedPaddleTensors(
    tensors[i].init(typeid(float));
    ConvertPaddleTensors(inputs[i], &tensors[i]);
  }
-  paddle_mobile_->FeedTensorData(tensors);
+  // paddle_mobile_->FeedTensorData(tensors);
 }

 template <typename Device, typename T>

--- a/src/io/jni/paddle_mobile_jni.cpp
+++ b/src/io/jni/paddle_mobile_jni.cpp
@@ -39,8 +39,6 @@ using framework::Tensor;
 using paddle_mobile::CPU;
 using std::string;

-const char *ANDROID_LOG_TAG =
-    "paddle_mobile LOG built on " __DATE__ " " __TIME__;
 paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 static std::mutex shared_mutex;


--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -91,7 +91,6 @@ class PaddleMobile {
  void InjectVariable(const framework::Tensor &t, std::string var_name);
  void FeedData(const framework::Tensor &t);
  void FeedData(const std::vector<void *> &v);
-  void FeedTensorData(const std::vector<framework::Tensor> &v);

  void GetResults(std::vector<void *> *v);
  void GetTensorResults(std::vector<framework::Tensor *> *v);

--- a/src/operators/fusion_deconv_add_bn_op.h
+++ b/src/operators/fusion_deconv_add_bn_op.h
@@ -57,7 +57,7 @@ class FusionDeconvAddBNOp : public framework::OperatorWithKernel<
  FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs,
                      const VariableNameMap &outputs,
                      const framework::AttributeMap &attrs,
-                      std::shared_ptr<framework::Scope> scope)
+                      framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDeconvAddBNParam<DeviceType>,
            operators::DeconvAddBNKernel<DeviceType, T>>(type, inputs, outputs,

--- a/src/operators/fusion_deconv_add_bn_relu_op.h
+++ b/src/operators/fusion_deconv_add_bn_relu_op.h
@@ -59,7 +59,7 @@ class FusionDeconvAddBNReluOp
  FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs,
                          const VariableNameMap &outputs,
                          const framework::AttributeMap &attrs,
-                          std::shared_ptr<framework::Scope> scope)
+                          framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDeconvAddBNReluParam<DeviceType>,
            operators::DeconvAddBNReluKernel<DeviceType, T>>(

--- a/src/operators/fusion_deconv_bn_relu_op.h
+++ b/src/operators/fusion_deconv_bn_relu_op.h
@@ -56,7 +56,7 @@ class FusionDeconvBNReluOp
  FusionDeconvBNReluOp(const string &type, const VariableNameMap &inputs,
                       const VariableNameMap &outputs,
                       const framework::AttributeMap &attrs,
-                       std::shared_ptr<framework::Scope> scope)
+                       framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDeconvBNReluParam<DeviceType>,
            operators::DeconvBNReluKernel<DeviceType, T>>(type, inputs, outputs,

--- a/src/operators/kernel/central-arm-func/conv_arm_func.cpp
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.cpp
@@ -47,6 +47,7 @@ bool IsExpand(const std::vector<int64_t> &filter_dim,
  return !(filter_1 && strides_1 && padding_0 && dilation_1);
 }

+#ifdef PADDLE_MOBILE_CPU
 template <typename Itype, typename Otype>
 void GemmConv(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
@@ -241,6 +242,7 @@ template void GemmConv<int8_t, int32_t>(const ConvParam<CPU> &param);
 template void DepthwiseConv3x3<int8_t, int32_t>(const ConvParam<CPU> &param);
 template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param);
 #endif
+#endif

 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/V1/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_kernel.cpp
@@ -24,8 +24,8 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
  paddle_mobile::fpga::ActivationType activation_enable =
      paddle_mobile::fpga::NONE;
  int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
  int channel = out->dims()[1];
  auto bs_ptr =

--- a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp
@@ -27,10 +27,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
  paddle_mobile::fpga::ActivationType activation_enable =
      paddle_mobile::fpga::NONE;
  int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
  // const Tensor *bias = param->Bias();
  // auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();

  // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],

--- a/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp
@@ -27,10 +27,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
  paddle_mobile::fpga::ActivationType activation_enable =
      paddle_mobile::fpga::NONE;
  int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
  const Tensor *bias = param->InputBias();
  auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();

  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],

--- a/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp
@@ -28,10 +28,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
  paddle_mobile::fpga::ActivationType activation_enable =
      paddle_mobile::fpga::LEAKYRELU;
  int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
  const Tensor *bias = param->InputBias();
  auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();

  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],

--- a/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
@@ -29,10 +29,10 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
  paddle_mobile::fpga::ActivationType activation_enable =
      paddle_mobile::fpga::LEAKYRELU;
  int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
  const Tensor *bias = param->InputBias();
  auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
  auto out = param->Output();
  auto bn_mean_ptr = param->InputMean()->data<float>();
  auto bn_var_ptr = param->InputVariance()->data<float>();

--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -57,13 +57,9 @@ void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
 }
 template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  auto input = const_cast<Tensor *>(param.InputX());
-  if (input->type() == typeid(float)) {
-    int col = param.Col();
-    auto output = &(param.Out()->at(col));
-    output->ShareDataWith(*input);
-    return;
-  }
+  auto input = const_cast<LoDTensor *>(param.InputX());
+  int col = param.Col();
+  LoDTensor *out = &param.Out()->at(col);

  fpga::BypassArgs args = param.fpga_bypass_args;
  auto input_address = (input->data<half>());
@@ -71,7 +67,7 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
  float *outdata_ptr =
      reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
  const int num_th = 32;
-  if ((param.Out()->fpga_data_num) < num_th) {
+  if ((out->fpga_data_num) < num_th) {
    fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));

    for (int idx = 0; idx < product(input->dims()); ++idx) {
@@ -81,14 +77,14 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
  }

  fpga::PerformBypass(args);
-  auto outC = param.Out()->dims()[1];
-  auto outH = param.Out()->dims()[2];
-  auto outW = param.Out()->dims()[3];
+  auto outC = out->dims()[1];
+  auto outH = out->dims()[2];
+  auto outW = out->dims()[3];

  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        param.Out()->fpga_data_num * sizeof(float));
+                        out->fpga_data_num * sizeof(float));

-  if (param.Out()->fpga_data_num != product(input->dims())) {
+  if (out->fpga_data_num != product(input->dims())) {
    float *data_tmp =
        reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
    dealign(outdata_ptr, data_tmp, outC, outH, outW);

--- a/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp
@@ -25,7 +25,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
      paddle_mobile::fpga::LEAKYRELU;
  int16_t leaky_relu_negative_slope = 0;
  auto input_x = const_cast<LoDTensor *>(param->InputX());
-  auto filter = const_cast<Tensor *>(param->InputY());
+  auto filter = const_cast<LoDTensor *>(param->InputY());
  const Tensor *input_z = param->InputZ();
  auto input_z_ptr = input_z->data<float>();
  auto out = param->Out();

--- a/src/operators/kernel/fpga/V1/pad2d_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 template <>
-bool Pad2dKernel<FPGA, float>::Init(Pad2dParam<FPGA> *param) {
-  Tensor *output = param->Out();
+bool Pad2DKernel<FPGA, float>::Init(Pad2DParam<FPGA> *param) {
+  Tensor *output = param->output_;
  fpga::format_fp16_ofm(output);
  return true;
 }
@@ -39,9 +39,9 @@ void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
  }
 }
 template <>
-void Pad2dKernel<FPGA, float>::Compute(const Pad2dParam<FPGA> &param) {
-  auto in_x = param.InputX();
-  auto out = param.Out();
+void Pad2DKernel<FPGA, float>::Compute(const Pad2DParam<FPGA> &param) {
+  auto in_x = param.input_;
+  auto out = param.output_;
  fpga::fpga_invalidate((void *)in_x->data<half>(),  // NOLINT
                        in_x->numel() * sizeof(half));
  pad2dFunc(in_x, out);

--- a/src/operators/kernel/fpga/V1/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp
@@ -68,7 +68,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {

 template <>
 void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
-  auto *input = const_cast<Tensor *>(param.Input());
+  auto *input = const_cast<LoDTensor *>(param.Input());

  if (input->type() == typeid(float)) {
    auto *output = param.Output();

--- a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
@@ -24,7 +24,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
  paddle_mobile::fpga::ActivationType activation_enable =
      paddle_mobile::fpga::SIGMOID;
  int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->InputX());
+  auto input = const_cast<LoDTensor *>(param->InputX());
  auto input_ptr = input->data<half>();
  auto out = param->Out();
  fpga::format_fp16_ofm(out);

--- a/src/operators/math/channel_wise.h
+++ b/src/operators/math/channel_wise.h
@@ -33,7 +33,7 @@ void AddChannelWise(const framework::Tensor *input,
  // maybe check shape
  int batch_size = input->dims()[0];
  int channels = input->dims()[1];
-  size_t spatial_size = input->dims()[2] * input->dims()[3];
+  int spatial_size = input->dims()[2] * input->dims()[3];

  for (int batch = 0; batch < batch_size; ++batch) {
    for (int channel = 0; channel < channels; ++channel) {
@@ -88,7 +88,7 @@ void ScaleAddChannelWise(const framework::Tensor *input,
  // maybe check shape
  int batch_size = input->dims()[0];
  int channels = input->dims()[1];
-  size_t spatial_size = input->dims()[2] * input->dims()[3];
+  int spatial_size = input->dims()[2] * input->dims()[3];

  for (int batch = 0; batch < batch_size; ++batch) {
    for (int channel = 0; channel < channels; ++channel) {

--- a/src/operators/math/gemm/cblas.cc
+++ b/src/operators/math/gemm/cblas.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
 #pragma once

 #include "operators/math/gemm/cblas.h"
@@ -47,3 +49,5 @@ void cblas_sgemv(const bool trans, const int M, const int N, const float alpha,
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
+
+#endif
--- a/src/operators/pad2d_op.cpp
+++ b/src/operators/pad2d_op.cpp
@@ -37,5 +37,8 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(pad2d, ops::Pad2DOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2DOp);
+#endif

 #endif  // PAD2D_OP
--- a/test/fpga/test_rfcn_api.cpp
+++ b/test/fpga/test_rfcn_api.cpp
@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifndef PADDLE_MOBILE_FPGA
-#define PADDLE_MOBILE_FPGA
-#endif
-#include <fstream>
 #include <iostream>
-#include "io/paddle_inference_api.h"
+#include "../test_helper.h"
+#include "../test_include.h"
+
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#endif

-static const char *g_image = "../models/rfcn/data.bin";
-static const char *g_model = "../models/rfcn/model";
-static const char *g_param = "../models/rfcn/params";
+#include <string>

 void readStream(std::string filename, char *buf) {
  std::ifstream in;
@@ -35,137 +37,116 @@ void readStream(std::string filename, char *buf) {
  auto length = in.tellg();    // report location (this is the length)
  in.seekg(0, std::ios::beg);  // go back to the beginning
  in.read(buf, length);
+  DLOG << length;
  in.close();
 }

-PaddleMobileConfig GetConfig() {
-  PaddleMobileConfig config;
-  config.precision = PaddleMobileConfig::FP32;
-  config.device = PaddleMobileConfig::kFPGA;
-  config.prog_file = g_model;
-  config.param_file = g_param;
-  config.thread_num = 1;
-  config.batch_size = 1;
-  config.optimize = true;
-  config.lod_mode = true;
-  config.quantification = false;
-  return config;
-}
-
-PaddleMobileConfig GetConfig1() {
-  PaddleMobileConfig config;
-  config.precision = PaddleMobileConfig::FP32;
-  config.device = PaddleMobileConfig::kFPGA;
-  config.model_dir = "../models/resnet50";
-  config.thread_num = 1;
-  config.batch_size = 1;
-  config.optimize = true;
-  config.quantification = false;
-  return config;
+void convert_to_chw(int16_t **data_in, int channel, int height, int width,
+                    int num, int16_t *data_tmp) {
+  int64_t amount_per_side = width * height;
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          *(data_tmp + n * amount_per_side * channel + c * amount_per_side +
+            width * h + w) = *((*data_in)++);
+        }
+      }
+    }
+  }
 }

-int main() {
-  open_device();
-
-  PaddleMobileConfig config = GetConfig();
-  auto predictor =
-      CreatePaddlePredictor<PaddleMobileConfig,
-                            PaddleEngineKind::kPaddleMobile>(config);
-
-  std::cout << "Finishing loading model" << std::endl;
-
-  float img_info[3] = {432, 1280, 1.0f};
-  int img_length = 432 * 1280 * 3;
-  auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
-  readStream(g_image, reinterpret_cast<char *>(img));
-
-  std::cout << "Finishing initializing data" << std::endl;
-  struct PaddleTensor t_img_info, t_img;
-  t_img.dtypeid = typeid(float);
-  t_img_info.layout = LAYOUT_HWC;
-  t_img_info.shape = std::vector<int>({1, 3});
-  t_img_info.name = "Image information";
-  t_img_info.data.Reset(img_info, 3 * sizeof(float));
-
-  t_img.dtypeid = typeid(float);
-  t_img.layout = LAYOUT_HWC;
-  t_img.shape = std::vector<int>({1, 432, 1280, 3});
-  t_img.name = "Image information";
-  t_img.data.Reset(img, img_length * sizeof(float));
-  predictor->FeedPaddleTensors({t_img_info, t_img});
-
-  std::cout << "Finishing feeding data " << std::endl;
-
-  predictor->Predict_From_To(0, -1);
-  std::cout << "Finishing predicting " << std::endl;
-
-  std::vector<PaddleTensor> v;        // No need to initialize v
-  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
-  std::cout << "Output number is " << v.size() << std::endl;
-  std::cout << "out[0] length " << v[0].data.length() << std::endl;
-  std::cout << "out[1] length " << v[1].data.length() << std::endl;
-  std::cout << "out[2] length " << v[2].data.length() << std::endl;
-
-  auto post_nms = v[0].data.length() / sizeof(float) / 8;
-  for (int num = 0; num < post_nms; num++) {
-    for (int i = 0; i < 8; i++) {
-      auto p = reinterpret_cast<float *>(v[0].data.data());
-      std::cout << p[num * 8 + i] << std::endl;
-    }
+void dump_stride_half(std::string filename, Tensor input_tensor,
+                      const int dumpnum, bool use_chw) {
+  // bool use_chw = true;
+  if (input_tensor.dims().size() != 4) return;
+  int c = (input_tensor.dims())[1];
+  int h = (input_tensor.dims())[2];
+  int w = (input_tensor.dims())[3];
+  int n = (input_tensor.dims())[0];
+  auto data_ptr = input_tensor.get_data();
+  auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
+  auto data_tmp = data_ptr_16;
+  if (use_chw) {
+    data_tmp =
+        reinterpret_cast<half *>(malloc(n * c * h * w * sizeof(int16_t)));
+    convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
  }
-  for (int num = 0; num < post_nms; num++) {
-    for (int i = 0; i < 8; i++) {
-      auto p = reinterpret_cast<float *>(v[1].data.data());
-      std::cout << p[num * 8 + i] << std::endl;
-    }
+  std::ofstream out(filename.c_str());
+  float result = 0;
+  int stride = input_tensor.numel() / dumpnum;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
+    out << result << std::endl;
  }
-  for (int num = 0; num < post_nms; num++) {
-    for (int i = 0; i < 4; i++) {
-      auto p = reinterpret_cast<float *>(v[2].data.data());
-      std::cout << p[num * 4 + i] << std::endl;
-    }
+  out.close();
+  if (data_tmp != data_ptr_16) {
+    free(data_tmp);
  }
-  std::cout << "Finish getting vector values" << std::endl;
-
-  ////////////////////////////////////////////////////
+}

-  PaddleTensor tensor;
-  predictor->GetPaddleTensor("fetch2", &tensor);
-  for (int i = 0; i < post_nms; i++) {
-    auto p = reinterpret_cast<float *>(tensor.data.data());
-    std::cout << p[+i] << std::endl;
+void dump_stride_float(std::string filename, Tensor input_tensor,
+                       const int dumpnum) {
+  auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
+  std::ofstream out(filename.c_str());
+  float result = 0;
+  int stride = input_tensor.numel() / dumpnum;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    result = data_ptr[i];
+    out << result << std::endl;
  }
+  out.close();
+}

-  //////////////////////////////////////////////////////
-
-  PaddleMobileConfig config1 = GetConfig1();
-  auto predictor1 =
-      CreatePaddlePredictor<PaddleMobileConfig,
-                            PaddleEngineKind::kPaddleMobile>(config1);
-
-  std::cout << "Finishing loading model" << std::endl;
-
-  int img_length1 = 224 * 224 * 3;
-  auto img1 =
-      reinterpret_cast<float *>(fpga_malloc(img_length1 * sizeof(float)));
-
-  std::cout << "Finishing initializing data" << std::endl;
+void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
+                 bool use_chw) {
+  static int i = 0;
+  if (input_tensor.numel() == 0) {
+    return;
+  }
+  if (input_tensor.type() == typeid(float)) {
+    DLOG << "op: " << i++ << ", float data  " << input_tensor.numel();

-  struct PaddleTensor t_img1;
+    dump_stride_float(filename, input_tensor, dumpnum);
+  } else {
+    DLOG << "op: " << i++ << ", half data  " << input_tensor.numel();

-  t_img1.dtypeid = typeid(float);
-  t_img1.layout = LAYOUT_HWC;
-  t_img1.shape = std::vector<int>({1, 224, 224, 3});
-  t_img1.name = "Image information";
-  t_img1.data.Reset(img1, img_length1 * sizeof(float));
-  predictor1->FeedPaddleTensors({t_img1});
-  predictor1->Predict_From_To(0, -1);
-  std::cout << "Finishing predicting " << std::endl;
+    dump_stride_half(filename, input_tensor, dumpnum, use_chw);
+  }
+  DLOG << "dump input address: " << input_tensor.get_data();
+}

-  std::vector<PaddleTensor> v1;         // No need to initialize v
-  predictor1->FetchPaddleTensors(&v1);  // Old data in v will be cleared
-  std::cout << "Output number is " << v1.size() << std::endl;
-  std::cout << "out[0] length " << v1[0].data.length() << std::endl;
+static const char *g_rfcn_combine = "../models/rfcn";
+static const char *g_image_src_float = "../models/rfcn/data.bin";
+int main() {
+  paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+
+  if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
+                         std::string(g_rfcn_combine) + "/params", true, false,
+                         1, true)) {
+    float img_info[3] = {768, 1536, 768.0f / 960.0f};
+    auto img = reinterpret_cast<float *>(
+        fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)));
+    readStream(g_image_src_float, reinterpret_cast<char *>(img));
+
+    std::vector<void *> v(3, nullptr);
+    paddle_mobile.FeedData(std::vector<void *>({img_info, img}));
+    paddle_mobile.Predict_To(-1);
+
+    for (int i = 65; i < 69; i++) {
+      auto tensor_ptr = paddle_mobile.FetchResult(i);
+      std::string saveName = "rfcn_" + std::to_string(i);
+      paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
+                                           tensor_ptr->numel() * sizeof(float));
+      dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true);
+    }
+    //   paddle_mobile.GetResults(&v);
+    DLOG << "Computation done";
+    fpga::fpga_free(img);
+  }

  return 0;
 }
--- a/test/net/test_benchmark.cpp
+++ b/test/net/test_benchmark.cpp
@@ -36,7 +36,10 @@ int main(int argc, char* argv[]) {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(thread_num);
  auto time1 = time();
-  if (paddle_mobile.Load(fluid_model, optimize)) {
+//  if (paddle_mobile.Load(fluid_model, optimize, false, 1, true)) {
+  if (paddle_mobile.Load(std::string(fluid_model) + "/model",
+                                 std::string(fluid_model) + "/params", optimize,
+                                 false, 1, true)) {
    auto time2 = time();
    std::cout << "load cost :" << time_diff(time1, time2) << "ms\n";
    paddle_mobile::framework::Tensor input;
@@ -51,14 +54,15 @@ int main(int argc, char* argv[]) {
    paddle_mobile::framework::DDim in_shape =
        paddle_mobile::framework::make_ddim(dims);
    SetupTensor<float>(&input, in_shape, 0.f, 255.f);
-    // warmup
-    for (int i = 0; i < 10; ++i) {
+//    // warmup
+    for (int i = 0; i < 2; ++i) {
      paddle_mobile.Predict(input);
    }
    auto time3 = time();
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input);
    }
+
    auto time4 = time();
    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n";
    std::ostringstream os("output tensor size: ");
@@ -68,7 +72,7 @@ int main(int argc, char* argv[]) {
      os << ", " << output->data<float>()[i];
    }
    std::string output_str = os.str();
-    std::cout << output_str << std::endl;
+//    std::cout << output_str << std::endl;
  }
  return 0;
 }