diff --git a/src/common/log.h b/src/common/log.h
index d574818f865ab6b2af748a5b3162b589f396a564..282ee2780993447051143866f65907ba7ce17be3 100644
--- a/src/common/log.h
+++ b/src/common/log.h
@@ -31,7 +31,8 @@ namespace paddle_mobile {
 
 #ifdef ANDROID
 
-extern const char *ANDROID_LOG_TAG;
+static const char *const ANDROID_LOG_TAG =  // const pointer, one copy per TU
+    "paddle_mobile LOG built on " __DATE__ " " __TIME__;
 
 #define ANDROIDLOGI(...)                                               \
   __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index e4ffdaf05d5dad129138b2a7745619c86e8ca805..b5fab192aaed8ecb7796fc81b2ac67d810654c4c 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -531,20 +531,6 @@ void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
   }
 }
 
-template <typename Device, typename T>
-void Executor<Device, T>::FeedTensorData(const vector<framework::Tensor> &v) {
-  auto input_size = v.size();
-  int index = 0;
-  auto vars = program_.scope->VarContain("feed", &index);
-  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
-                        "input data number not correct");
-  for (int i = 0; i < input_size; i++) {
-    auto var = program_.scope->Var("feed", i + index);
-    auto feed_tensor = var->template GetMutable<LoDTensor>();
-    feed_tensor->ShareDataWith(v[i]);
-  }
-}
-
 template <typename Device, typename T>
 void Executor<Device, T>::GetResults(std::vector<void *> *v) {
   auto output_size = v->size();
diff --git a/src/framework/executor.h b/src/framework/executor.h
index ea7bde7f748352c9b1221e69f3359938b7371a39..853914c54cb962c570ae2a9751500d3275091499 100644
--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -53,7 +53,6 @@ class Executor {
   void InjectVariable(const Tensor &t, std::string var_name);
   void FeedData(const Tensor &t);
   void FeedData(const std::vector<void *> &v);
-  void FeedTensorData(const std::vector<framework::Tensor> &v);
 
   void GetResults(std::vector<void *> *v);
   void GetTensorResults(std::vector<framework::Tensor *> *v);
diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc
index 1f4769b282385207a5b53d6d678364393d7da6cc..5839a279cdfc03472628cf7650b30064281a226e 100644
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -146,7 +146,7 @@ void PaddleMobilePredictor<Device, T>::FeedPaddleTensors(
     tensors[i].init(typeid(float));
     ConvertPaddleTensors(inputs[i], &tensors[i]);
   }
-  paddle_mobile_->FeedTensorData(tensors);
+  // FIXME: tensors are converted but never fed (FeedTensorData was removed).
 }
 
 template <typename Device, typename T>
diff --git a/src/io/jni/paddle_mobile_jni.cpp b/src/io/jni/paddle_mobile_jni.cpp
index 12c0a6cbca1721578efe175d8c108e30de18be7d..63511a2226e9563e758f87fea4fed67438eda8f6 100644
--- a/src/io/jni/paddle_mobile_jni.cpp
+++ b/src/io/jni/paddle_mobile_jni.cpp
@@ -39,8 +39,6 @@ using framework::Tensor;
 using paddle_mobile::CPU;
 using std::string;
 
-const char *ANDROID_LOG_TAG =
-    "paddle_mobile LOG built on " __DATE__ " " __TIME__;
 paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 static std::mutex shared_mutex;
 
diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h
index e3fd9f40f4194ed2841ba11366c6c5142e6279ab..7983541a221fb63f573dfa8186599934cd97387b 100644
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -91,7 +91,6 @@ class PaddleMobile {
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
   void FeedData(const std::vector<void *> &v);
-  void FeedTensorData(const std::vector<framework::Tensor> &v);
 
   void GetResults(std::vector<void *> *v);
   void GetTensorResults(std::vector<framework::Tensor *> *v);
diff --git a/src/operators/fusion_deconv_add_bn_op.h b/src/operators/fusion_deconv_add_bn_op.h
index f7f9b9e2094a7228c944b70b88ae3105ae9f37e8..618545044136e42e750fd4c71ce96bd861954b71 100644
--- a/src/operators/fusion_deconv_add_bn_op.h
+++ b/src/operators/fusion_deconv_add_bn_op.h
@@ -57,7 +57,7 @@ class FusionDeconvAddBNOp : public framework::OperatorWithKernel<
   FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs,
                       const VariableNameMap &outputs,
                       const framework::AttributeMap &attrs,
-                      std::shared_ptr<framework::Scope> scope)
+                      framework::Scope *scope)
       : framework::OperatorWithKernel<
             DeviceType, FusionDeconvAddBNParam<DeviceType>,
             operators::DeconvAddBNKernel<DeviceType, T>>(type, inputs, outputs,
diff --git a/src/operators/fusion_deconv_add_bn_relu_op.h b/src/operators/fusion_deconv_add_bn_relu_op.h
index 97070ef01e544839be8eab6ddba21c43dfa9a26e..1c6cfd7318e48cad16e1d274b5724c832c70d8c8 100644
--- a/src/operators/fusion_deconv_add_bn_relu_op.h
+++ b/src/operators/fusion_deconv_add_bn_relu_op.h
@@ -59,7 +59,7 @@ class FusionDeconvAddBNReluOp
   FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs,
                           const VariableNameMap &outputs,
                           const framework::AttributeMap &attrs,
-                          std::shared_ptr<framework::Scope> scope)
+                          framework::Scope *scope)
       : framework::OperatorWithKernel<
             DeviceType, FusionDeconvAddBNReluParam<DeviceType>,
             operators::DeconvAddBNReluKernel<DeviceType, T>>(
diff --git a/src/operators/fusion_deconv_bn_relu_op.h b/src/operators/fusion_deconv_bn_relu_op.h
index ad0920ebd69b1a13ebc0e85f2c5f6008379715da..92bb97445d1442056843efb1fd66fa3fb1e54511 100644
--- a/src/operators/fusion_deconv_bn_relu_op.h
+++ b/src/operators/fusion_deconv_bn_relu_op.h
@@ -56,7 +56,7 @@ class FusionDeconvBNReluOp
   FusionDeconvBNReluOp(const string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
-                       std::shared_ptr<framework::Scope> scope)
+                       framework::Scope *scope)
       : framework::OperatorWithKernel<
             DeviceType, FusionDeconvBNReluParam<DeviceType>,
             operators::DeconvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.cpp b/src/operators/kernel/central-arm-func/conv_arm_func.cpp
index c34bd1f5d909317fe727a192f5dc27479f71bc90..2c3166720652a77d3b628d2e5fd5d227a1a7fc33 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.cpp
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.cpp
@@ -47,6 +47,7 @@ bool IsExpand(const std::vector<int64_t> &filter_dim,
   return !(filter_1 && strides_1 && padding_0 && dilation_1);
 }
 
+#ifdef PADDLE_MOBILE_CPU
 template <typename Itype, typename Otype>
 void GemmConv(const ConvParam<CPU> &param) {
   const Tensor *input = param.Input();
@@ -241,6 +242,7 @@ template void GemmConv<int8_t, int32_t>(const ConvParam<CPU> &param);
 template void DepthwiseConv3x3<int8_t, int32_t>(const ConvParam<CPU> &param);
 template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param);
 #endif
+#endif
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/V1/conv_kernel.cpp b/src/operators/kernel/fpga/V1/conv_kernel.cpp
index 73722820bd90b54abd64dd01b157c74c6a1069e8..57b5eb754e327160399bee728d0689101fac1134 100644
--- a/src/operators/kernel/fpga/V1/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_kernel.cpp
@@ -24,8 +24,8 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
   int channel = out->dims()[1];
   auto bs_ptr =
diff --git a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp
index 788504df5d2ea1005cfaa76f12b58e61c0218391..1597885e43e01895b6acd425031341af70d5eaf7 100644
--- a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp
@@ -27,10 +27,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   // const Tensor *bias = param->Bias();
   // auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
diff --git a/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp
index 4239ac1e5da421cb0e2421a8919d8d15e40348af..a8205df3c9c1052055ba15ca58fd215f1d49ba0e 100644
--- a/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp
@@ -27,10 +27,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->InputBias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
diff --git a/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp
index 28b8c83198a5517ed0dc9732e0033030a876a7da..b27f5cf870d2e3220bec31ee63bb27361cb2c8cf 100755
--- a/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp
@@ -28,10 +28,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->InputBias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
diff --git a/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
index f166587109e5f63e30203a940aa3baa8ae87f844..75597f0ecd570b6b21894a2f9a0ff0ad91a54ea4 100644
--- a/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
@@ -29,10 +29,10 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->InputBias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
index 2aea5a770c674a7d70dc2abf0d691598444f9a25..b128c8e3430b8a359a5ad9dbcba397ad0f2b6568 100644
--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -57,13 +57,9 @@ void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
 }
 template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  auto input = const_cast<Tensor *>(param.InputX());
-  if (input->type() == typeid(float)) {
-    int col = param.Col();
-    auto output = &(param.Out()->at(col));
-    output->ShareDataWith(*input);
-    return;
-  }
+  auto input = const_cast<LoDTensor *>(param.InputX());
+  int col = param.Col();
+  LoDTensor *out = &param.Out()->at(col);
 
   fpga::BypassArgs args = param.fpga_bypass_args;
   auto input_address = (input->data<half>());
@@ -71,7 +67,7 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   float *outdata_ptr =
       reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
   const int num_th = 32;
-  if ((param.Out()->fpga_data_num) < num_th) {
+  if ((out->fpga_data_num) < num_th) {
     fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
 
     for (int idx = 0; idx < product(input->dims()); ++idx) {
@@ -81,14 +77,14 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   }
 
   fpga::PerformBypass(args);
-  auto outC = param.Out()->dims()[1];
-  auto outH = param.Out()->dims()[2];
-  auto outW = param.Out()->dims()[3];
+  auto outC = out->dims()[1];
+  auto outH = out->dims()[2];
+  auto outW = out->dims()[3];
 
   fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        param.Out()->fpga_data_num * sizeof(float));
+                        out->fpga_data_num * sizeof(float));
 
-  if (param.Out()->fpga_data_num != product(input->dims())) {
+  if (out->fpga_data_num != product(input->dims())) {
     float *data_tmp =
         reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
     dealign(outdata_ptr, data_tmp, outC, outH, outW);
diff --git a/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp
index 6fbeb63fe606aac014f76088210c74a4118e6c78..fef370515e9e9ffa1d90c184e62919235533b8a5 100644
--- a/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp
@@ -25,7 +25,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
   auto input_x = const_cast<LoDTensor *>(param->InputX());
-  auto filter = const_cast<Tensor *>(param->InputY());
+  auto filter = const_cast<LoDTensor *>(param->InputY());
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   auto out = param->Out();
diff --git a/src/operators/kernel/fpga/V1/pad2d_kernel.cpp b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp
index f47a585ee412316ce65084c5fa10a622ffb93a4f..5d81f71c3608d19f5be5c46699b8379ebb279982 100644
--- a/src/operators/kernel/fpga/V1/pad2d_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 template <>
-bool Pad2dKernel<FPGA, float>::Init(Pad2dParam<FPGA> *param) {
-  Tensor *output = param->Out();
+bool Pad2DKernel<FPGA, float>::Init(Pad2DParam<FPGA> *param) {
+  Tensor *output = param->output_;
   fpga::format_fp16_ofm(output);
   return true;
 }
@@ -39,9 +39,9 @@ void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
   }
 }
 template <>
-void Pad2dKernel<FPGA, float>::Compute(const Pad2dParam<FPGA> &param) {
-  auto in_x = param.InputX();
-  auto out = param.Out();
+void Pad2DKernel<FPGA, float>::Compute(const Pad2DParam<FPGA> &param) {
+  auto in_x = param.input_;
+  auto out = param.output_;
   fpga::fpga_invalidate((void *)in_x->data<half>(),  // NOLINT
                         in_x->numel() * sizeof(half));
   pad2dFunc(in_x, out);
diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp
index 4c0e09e63f2785b535f81b5262afe93099a74aa5..994fa151621956aa791d36cc0f4cd829dc88f3d1 100644
--- a/src/operators/kernel/fpga/V1/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp
@@ -68,7 +68,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
 
 template <>
 void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
-  auto *input = const_cast<Tensor *>(param.Input());
+  auto *input = const_cast<LoDTensor *>(param.Input());
 
   if (input->type() == typeid(float)) {
     auto *output = param.Output();
diff --git a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
index bf36873a1fb442a4d5ff6f57056515009d275cd6..bb9eb3d6e8acf3d59ce3c4541f8c553fe7cb1cc2 100644
--- a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
@@ -24,7 +24,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::SIGMOID;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->InputX());
+  auto input = const_cast<LoDTensor *>(param->InputX());
   auto input_ptr = input->data<half>();
   auto out = param->Out();
   fpga::format_fp16_ofm(out);
diff --git a/src/operators/math/channel_wise.h b/src/operators/math/channel_wise.h
index 796ea6d2b97d31d3091b225601065ee4670316e8..e4c0cbe05bfabde42df7f33a71882aa8ec08c477 100644
--- a/src/operators/math/channel_wise.h
+++ b/src/operators/math/channel_wise.h
@@ -33,7 +33,7 @@ void AddChannelWise(const framework::Tensor *input,
   // maybe check shape
   int batch_size = input->dims()[0];
   int channels = input->dims()[1];
-  size_t spatial_size = input->dims()[2] * input->dims()[3];
+  int spatial_size = input->dims()[2] * input->dims()[3];
 
   for (int batch = 0; batch < batch_size; ++batch) {
     for (int channel = 0; channel < channels; ++channel) {
@@ -88,7 +88,7 @@ void ScaleAddChannelWise(const framework::Tensor *input,
   // maybe check shape
   int batch_size = input->dims()[0];
   int channels = input->dims()[1];
-  size_t spatial_size = input->dims()[2] * input->dims()[3];
+  int spatial_size = input->dims()[2] * input->dims()[3];
 
   for (int batch = 0; batch < batch_size; ++batch) {
     for (int channel = 0; channel < channels; ++channel) {
diff --git a/src/operators/math/gemm/cblas.cc b/src/operators/math/gemm/cblas.cc
index 6dc04d1b4e1ec3b8247713c2060bbd767c76d2e2..adc375b62913f0ad1105080f8c26b547e96671f3 100644
--- a/src/operators/math/gemm/cblas.cc
+++ b/src/operators/math/gemm/cblas.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
 #pragma once
 
 #include "operators/math/gemm/cblas.h"
@@ -47,3 +49,5 @@ void cblas_sgemv(const bool trans, const int M, const int N, const float alpha,
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/pad2d_op.cpp b/src/operators/pad2d_op.cpp
index 3d0fdf44d5b3a97298e1bc2809794e23e3a9e908..8a771c36a50f5a1b458df38d73ed93be61859cd4 100644
--- a/src/operators/pad2d_op.cpp
+++ b/src/operators/pad2d_op.cpp
@@ -37,5 +37,8 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(pad2d, ops::Pad2DOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2DOp);
+#endif
 
 #endif  // PAD2D_OP
diff --git a/test/fpga/test_rfcn_api.cpp b/test/fpga/test_rfcn_api.cpp
index 724ef7d14d1189bb68cb5db1a583850dd1e72816..f787d8f9acfe85ead101aeb16a4fbebe1aefee65 100644
--- a/test/fpga/test_rfcn_api.cpp
+++ b/test/fpga/test_rfcn_api.cpp
@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_MOBILE_FPGA
-#define PADDLE_MOBILE_FPGA
-#endif
-#include <fstream>
 #include <iostream>
-#include "io/paddle_inference_api.h"
+#include "../test_helper.h"
+#include "../test_include.h"
+
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#endif
 
-static const char *g_image = "../models/rfcn/data.bin";
-static const char *g_model = "../models/rfcn/model";
-static const char *g_param = "../models/rfcn/params";
+#include <string>
 
 void readStream(std::string filename, char *buf) {
   std::ifstream in;
@@ -35,137 +37,116 @@ void readStream(std::string filename, char *buf) {
   auto length = in.tellg();    // report location (this is the length)
   in.seekg(0, std::ios::beg);  // go back to the beginning
   in.read(buf, length);
+  DLOG << length;
   in.close();
 }
 
-PaddleMobileConfig GetConfig() {
-  PaddleMobileConfig config;
-  config.precision = PaddleMobileConfig::FP32;
-  config.device = PaddleMobileConfig::kFPGA;
-  config.prog_file = g_model;
-  config.param_file = g_param;
-  config.thread_num = 1;
-  config.batch_size = 1;
-  config.optimize = true;
-  config.lod_mode = true;
-  config.quantification = false;
-  return config;
-}
-
-PaddleMobileConfig GetConfig1() {
-  PaddleMobileConfig config;
-  config.precision = PaddleMobileConfig::FP32;
-  config.device = PaddleMobileConfig::kFPGA;
-  config.model_dir = "../models/resnet50";
-  config.thread_num = 1;
-  config.batch_size = 1;
-  config.optimize = true;
-  config.quantification = false;
-  return config;
+// Repacks HWC data read via *data_in (which is advanced) as CHW in data_tmp.
+void convert_to_chw(int16_t **data_in, int channel, int height, int width,
+                    int num, int16_t *data_tmp) {
+  const int64_t plane = static_cast<int64_t>(width) * height;  // 64-bit product
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          data_tmp[(n * channel + c) * plane + width * h + w] = *((*data_in)++);
+        }
+      }
+    }
+  }
 }
 
-int main() {
-  open_device();
-
-  PaddleMobileConfig config = GetConfig();
-  auto predictor =
-      CreatePaddlePredictor<PaddleMobileConfig,
-                            PaddleEngineKind::kPaddleMobile>(config);
-
-  std::cout << "Finishing loading model" << std::endl;
-
-  float img_info[3] = {432, 1280, 1.0f};
-  int img_length = 432 * 1280 * 3;
-  auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
-  readStream(g_image, reinterpret_cast<char *>(img));
-
-  std::cout << "Finishing initializing data" << std::endl;
-  struct PaddleTensor t_img_info, t_img;
-  t_img.dtypeid = typeid(float);
-  t_img_info.layout = LAYOUT_HWC;
-  t_img_info.shape = std::vector<int>({1, 3});
-  t_img_info.name = "Image information";
-  t_img_info.data.Reset(img_info, 3 * sizeof(float));
-
-  t_img.dtypeid = typeid(float);
-  t_img.layout = LAYOUT_HWC;
-  t_img.shape = std::vector<int>({1, 432, 1280, 3});
-  t_img.name = "Image information";
-  t_img.data.Reset(img, img_length * sizeof(float));
-  predictor->FeedPaddleTensors({t_img_info, t_img});
-
-  std::cout << "Finishing feeding data " << std::endl;
-
-  predictor->Predict_From_To(0, -1);
-  std::cout << "Finishing predicting " << std::endl;
-
-  std::vector<PaddleTensor> v;        // No need to initialize v
-  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
-  std::cout << "Output number is " << v.size() << std::endl;
-  std::cout << "out[0] length " << v[0].data.length() << std::endl;
-  std::cout << "out[1] length " << v[1].data.length() << std::endl;
-  std::cout << "out[2] length " << v[2].data.length() << std::endl;
-
-  auto post_nms = v[0].data.length() / sizeof(float) / 8;
-  for (int num = 0; num < post_nms; num++) {
-    for (int i = 0; i < 8; i++) {
-      auto p = reinterpret_cast<float *>(v[0].data.data());
-      std::cout << p[num * 8 + i] << std::endl;
-    }
+void dump_stride_half(std::string filename, Tensor input_tensor,
+                      const int dumpnum, bool use_chw) {
+  // Only 4-D tensors are dumped; dims are read as [n, c, h, w].
+  if (input_tensor.dims().size() != 4) return;
+  int c = (input_tensor.dims())[1];
+  int h = (input_tensor.dims())[2];
+  int w = (input_tensor.dims())[3];
+  int n = (input_tensor.dims())[0];
+  auto *data_ptr_16 = reinterpret_cast<half *>(input_tensor.get_data());
+  auto data_tmp = data_ptr_16;
+  if (use_chw) {
+    data_tmp =
+        reinterpret_cast<half *>(malloc(n * c * h * w * sizeof(int16_t)));
+    if (data_tmp == nullptr) return;  // allocation failed; nothing to dump
+    convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
   }
-  for (int num = 0; num < post_nms; num++) {
-    for (int i = 0; i < 8; i++) {
-      auto p = reinterpret_cast<float *>(v[1].data.data());
-      std::cout << p[num * 8 + i] << std::endl;
-    }
+  std::ofstream out(filename.c_str());
+  // Guard dumpnum <= 0 (division by zero); the step is clamped to >= 1.
+  int stride = dumpnum > 0 ? input_tensor.numel() / dumpnum : 1;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    float result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
+    out << result << std::endl;
   }
-  for (int num = 0; num < post_nms; num++) {
-    for (int i = 0; i < 4; i++) {
-      auto p = reinterpret_cast<float *>(v[2].data.data());
-      std::cout << p[num * 4 + i] << std::endl;
-    }
+  out.close();
+  if (use_chw) {  // data_ptr_16 was advanced above, so test the flag instead
+    free(data_tmp);
   }
-  std::cout << "Finish getting vector values" << std::endl;
-
-  ////////////////////////////////////////////////////
+}
 
-  PaddleTensor tensor;
-  predictor->GetPaddleTensor("fetch2", &tensor);
-  for (int i = 0; i < post_nms; i++) {
-    auto p = reinterpret_cast<float *>(tensor.data.data());
-    std::cout << p[+i] << std::endl;
+// Writes about dumpnum evenly strided float values of the tensor to filename.
+void dump_stride_float(std::string filename, Tensor input_tensor,
+                       const int dumpnum) {
+  auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
+  std::ofstream out(filename.c_str());
+  // dumpnum <= 0 would divide by zero (or go negative): fall back to step 1.
+  int stride = dumpnum > 0 ? input_tensor.numel() / dumpnum : 1;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    out << data_ptr[i] << std::endl;
   }
+  out.close();
+}
 
-  //////////////////////////////////////////////////////
-
-  PaddleMobileConfig config1 = GetConfig1();
-  auto predictor1 =
-      CreatePaddlePredictor<PaddleMobileConfig,
-                            PaddleEngineKind::kPaddleMobile>(config1);
-
-  std::cout << "Finishing loading model" << std::endl;
-
-  int img_length1 = 224 * 224 * 3;
-  auto img1 =
-      reinterpret_cast<float *>(fpga_malloc(img_length1 * sizeof(float)));
-
-  std::cout << "Finishing initializing data" << std::endl;
+void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
+                 bool use_chw) {
+  static int i = 0;  // counts calls that reach a dump; only used in the logs
+  if (input_tensor.numel() == 0) {  // empty tensor: nothing to write
+    return;
+  }
+  if (input_tensor.type() == typeid(float)) {  // float goes to the float dumper
+    DLOG << "op: " << i++ << ", float data  " << input_tensor.numel();
 
-  struct PaddleTensor t_img1;
+    dump_stride_float(filename, input_tensor, dumpnum);
+  } else {  // every non-float type is handed to the half dumper
+    DLOG << "op: " << i++ << ", half data  " << input_tensor.numel();
 
-  t_img1.dtypeid = typeid(float);
-  t_img1.layout = LAYOUT_HWC;
-  t_img1.shape = std::vector<int>({1, 224, 224, 3});
-  t_img1.name = "Image information";
-  t_img1.data.Reset(img1, img_length1 * sizeof(float));
-  predictor1->FeedPaddleTensors({t_img1});
-  predictor1->Predict_From_To(0, -1);
-  std::cout << "Finishing predicting " << std::endl;
+    dump_stride_half(filename, input_tensor, dumpnum, use_chw);
+  }
+  DLOG << "dump input address: " << input_tensor.get_data();
+}
 
-  std::vector<PaddleTensor> v1;         // No need to initialize v
-  predictor1->FetchPaddleTensors(&v1);  // Old data in v will be cleared
-  std::cout << "Output number is " << v1.size() << std::endl;
-  std::cout << "out[0] length " << v1[0].data.length() << std::endl;
+static const char *g_rfcn_combine = "../models/rfcn";
+static const char *g_image_src_float = "../models/rfcn/data.bin";
+int main() {
+  paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+
+  if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
+                         std::string(g_rfcn_combine) + "/params", true, false,
+                         1, true)) {
+    float img_info[3] = {768, 1536, 768.0f / 960.0f};
+    auto img = reinterpret_cast<float *>(
+        fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)));
+    readStream(g_image_src_float, reinterpret_cast<char *>(img));
+
+    // Inputs are fed in the order {img_info, img}.
+    paddle_mobile.FeedData(std::vector<void *>({img_info, img}));
+    paddle_mobile.Predict_To(-1);
+
+    // Dump the tensors fetched for indices 65..68 to files rfcn_<index>.
+    for (int i = 65; i < 69; i++) {
+      auto tensor_ptr = paddle_mobile.FetchResult(i);
+      std::string saveName = "rfcn_" + std::to_string(i);
+      paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
+                                           tensor_ptr->numel() * sizeof(float));
+      dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true);
+    }
+    DLOG << "Computation done";
+    fpga::fpga_free(img);
+  }
 
   return 0;
 }
diff --git a/test/net/test_benchmark.cpp b/test/net/test_benchmark.cpp
index 31a0850c4d531d13f7960d9857b3721ee69c6d27..38e6f8e8701d28949331d03f2d8598c5ac46086c 100644
--- a/test/net/test_benchmark.cpp
+++ b/test/net/test_benchmark.cpp
@@ -36,7 +36,10 @@ int main(int argc, char* argv[]) {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   paddle_mobile.SetThreadNum(thread_num);
   auto time1 = time();
-  if (paddle_mobile.Load(fluid_model, optimize)) {
+//  if (paddle_mobile.Load(fluid_model, optimize, false, 1, true)) {
+  if (paddle_mobile.Load(std::string(fluid_model) + "/model",
+                                 std::string(fluid_model) + "/params", optimize,
+                                 false, 1, true)) {
     auto time2 = time();
     std::cout << "load cost :" << time_diff(time1, time2) << "ms\n";
     paddle_mobile::framework::Tensor input;
@@ -51,14 +54,15 @@ int main(int argc, char* argv[]) {
     paddle_mobile::framework::DDim in_shape =
         paddle_mobile::framework::make_ddim(dims);
     SetupTensor<float>(&input, in_shape, 0.f, 255.f);
-    // warmup
-    for (int i = 0; i < 10; ++i) {
+    // warmup
+    for (int i = 0; i < 2; ++i) {
       paddle_mobile.Predict(input);
     }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       paddle_mobile.Predict(input);
     }
+
     auto time4 = time();
     std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n";
     std::ostringstream os("output tensor size: ");
@@ -68,7 +72,7 @@ int main(int argc, char* argv[]) {
       os << ", " << output->data<float>()[i];
     }
     std::string output_str = os.str();
-    std::cout << output_str << std::endl;
+//    std::cout << output_str << std::endl;
   }
   return 0;
 }