diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index b8f131634e9eb4c56218db8f0643f10834089393..0f9f96dc65fcfd892a5ca99a7c36a71ebca83817 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -32,7 +32,6 @@ void format_image(framework::Tensor *image_tensor) {
   float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
 
   image::format_image(&p_data, channel, height, width);
-
   if (p_data != data_ptr && external_ptr == nullptr) {
     image_tensor->reset_data_ptr(p_data);
   }
@@ -61,6 +60,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(half));
+  ofm_tensor->fpga_data_num = memory_size / sizeof(half);
 }
 
 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
@@ -79,7 +79,9 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(half));
+  ofm_tensor->fpga_data_num = memory_size / sizeof(half);
 }
+
 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
@@ -96,6 +98,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
   ofm_tensor->set_type(typeid(float));
+  ofm_tensor->fpga_data_num = memory_size / sizeof(float);
 }
 
 float filter_find_max(framework::Tensor *filter_tensor) {
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index b9e7acfdaf3f1b70a8484d7426505da9c27b34a4..637521ea69e6301b6242d492eacc8dcf38091bfb 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -475,6 +475,19 @@ void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
   }
 }
 
+template <typename Device, typename T>
+void Executor<Device, T>::FeedData(const std::vector<framework::Tensor> &v) {
+  auto input_size = v.size();
+  auto vars = program_.scope->VarContain("feed");
+  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
+                        "input data number not correct");
+  for (int i = 0; i < input_size; i++) {
+    auto var = program_.scope->Var("feed", i);
+    auto feed_tensor = var->template GetMutable<LoDTensor>();
+    feed_tensor->ShareDataWith(v[i]);
+  }
+}
+
 template <typename Device, typename T>
 void Executor<Device, T>::GetResults(std::vector<void *> *v) {
   auto output_size = v->size();
@@ -489,6 +502,20 @@ void Executor<Device, T>::GetResults(std::vector<void *> *v) {
   }
 }
 
+template <typename Device, typename T>
+void Executor<Device, T>::GetResults(std::vector<framework::Tensor *> *v) {
+  auto output_size = v->size();
+  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
+  auto vars = program_.scope->VarContain("fetch");
+  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
+                        "output data number not correct");
+  for (int i = 0; i < output_size; i++) {
+    auto var = program_.scope->Var("fetch", i);
+    auto fetch_tensor = var->template GetMutable<LoDTensor>();
+    (*v)[i] = fetch_tensor;
+  }
+}
+
 template <typename Device, typename T>
 std::shared_ptr<LoDTensor> Executor<Device, T>::FetchResult(int id) {
   auto &ops = ops_of_block_[0];
diff --git a/src/framework/executor.h b/src/framework/executor.h
index ee285acac3e8bdf500452b6494bb37d79a2089e4..ba1a8b1afef3a9f592c4f84301576f187bc3c001 100644
--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -53,7 +53,11 @@ class Executor {
   void InjectVariable(const Tensor &t, std::string var_name);
   void FeedData(const Tensor &t);
   void FeedData(const std::vector<void *> &v);
+  void FeedData(const std::vector<framework::Tensor> &v);
+
   void GetResults(std::vector<void *> *v);
+  void GetResults(std::vector<framework::Tensor *> *v);
+
   std::shared_ptr<LoDTensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index c684169ce21474b4c68de9db523035866859818a..8b633ec5cca6719dc3b1ebf5637ca8796e90046f 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -31,6 +31,11 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {
 
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
+};
+
 class LoDTensor;
 
 class Tensor : public TensorBase {
@@ -223,6 +228,8 @@ class Tensor : public TensorBase {
   float scale[2];  // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
   void *external_data = nullptr;  // only used for Feed
+  LayoutType layout = LAYOUT_HWC;
+  int64_t fpga_data_num;
 #endif
 };
diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc
index ebeabead13a69ae1690335b4a73a9a511e086192..e2c2e6ffbfbe140d95d24684eb57227cc9503e78 100644
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -113,72 +113,53 @@ bool PaddleMobilePredictor<Device, T>::Run(
 }
 
 #ifdef PADDLE_MOBILE_FPGA
-template <typename Device, typename T>
-bool PaddleMobilePredictor<Device, T>::Run(
-    const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data, std::vector<int> *index_data,
-    int batch_size) {
-  if (inputs.empty()) {
-    LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
-    return false;
-  }
-  auto input = inputs[0];
-
-  if (input.shape.size() != 4) {
-    LOG(kLOG_ERROR) << "input shape not equal to 4!";
-    return false;
-  }
-  std::vector<int64_t> dims;
-  for (auto d : input.shape) {
-    dims.push_back(static_cast<int64_t>(d));
-  }
-
-  // use tensor
-  framework::DDim ddim =
-      framework::make_ddim({dims[0], dims[1], dims[2], dims[3]});
+void ConvertPaddleTensors(const PaddleTensor &src, framework::Tensor *des) {
+  des->Resize(framework::make_ddim(src.shape));
+  des->external_data = src.data.data();
+  des->set_type(src.dtypeid);
+  des->layout =
+      src.layout == LAYOUT_HWC ? framework::LAYOUT_HWC : framework::LAYOUT_CHW;
+}
 
-  framework::Tensor input_tensor;
-  input_tensor.Resize(ddim);
-  int input_length = framework::product(ddim);
-  auto input_ptr = input_tensor.mutable_data<T>();
+void ConvertTensors(const framework::Tensor &src, PaddleTensor *des) {
+  des->shape = framework::vectorize2int(src.dims());
+  des->dtypeid = src.type();
+  des->layout = src.layout == framework::LAYOUT_HWC ? LAYOUT_HWC : LAYOUT_CHW;
 
-  memcpy(input_ptr, static_cast<T *>(input.data.data()),
-         input_length * sizeof(T));
-  paddle_mobile_->Predict(input_tensor);
-  auto num_result = index_data->size();
-  if (output_data->size() != num_result) {
-    LOG(kLOG_ERROR) << "index and output number don't match";
-    return false;
+  auto num = src.numel();
+  if (src.type() == typeid(float)) {
+    des->data.Reset(const_cast<float *>(src.data<float>()),
+                    num * sizeof(float));
+  } else {
+    des->data.Reset(const_cast<int16_t *>(src.data<int16_t>()),
+                    num * sizeof(int16_t));
   }
+}
 
-  for (int i = 0; i < num_result; i++) {
-    auto output_tensor = paddle_mobile_->FetchResult((*index_data)[i]);
-
-    if (output_data->empty()) {
-      LOG(kLOG_ERROR)
-          << "At least one output should be set with tensors' names.";
-      return false;
-    }
-
-    auto &output = (*output_data)[i];
-    int output_length = output_tensor->numel();
-    std::vector<int64_t> tensor_shape =
-        framework::vectorize(output_tensor->dims());
-
-    for (auto d : tensor_shape) {
-      output.shape.push_back(static_cast<int>(d));
-    }
-
-    if (output.data.length() < output_length * sizeof(T)) {
-      output.data.Resize(output_length * sizeof(T));
-    }
-
-    memcpy(output.data.data(), output_tensor->template data<T>(),
-           output_length * sizeof(T));
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::FeedPaddleTensors(
+    const std::vector<PaddleTensor> &inputs) {
+  auto num = inputs.size();
+  std::vector<framework::Tensor> tensors(num, framework::Tensor());
+  for (int i = 0; i < num; i++) {
+    tensors[i].init(typeid(float));
+    ConvertPaddleTensors(inputs[i], &tensors[i]);
   }
+  paddle_mobile_->FeedData(tensors);
+}
 
-  return true;
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::FetchPaddleTensors(
+    std::vector<PaddleTensor> *outputs) {
+  auto num = outputs->size();
+  PADDLE_MOBILE_ENFORCE(num > 0, "0 output pointers is not permitted");
+  std::vector<framework::Tensor *> tensors(num, nullptr);
+  paddle_mobile_->GetResults(&tensors);
+  for (int i = 0; i < num; i++) {
+    ConvertTensors(*tensors[i], &(*outputs)[i]);
+  }
 }
+
 template <typename Device, typename T>
 void PaddleMobilePredictor<Device, T>::FeedData(
     const std::vector<void *> &inputs) {
diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h
index 0cadd71c226b20331c8399d2cfd8873c093a6b84..4ea83123355ac4dfff0479045eef7f1c0a4734d2 100644
--- a/src/io/api_paddle_mobile.h
+++ b/src/io/api_paddle_mobile.h
@@ -32,13 +32,13 @@ class PaddleMobilePredictor : public PaddlePredictor {
            std::vector<PaddleTensor>* output_data,
            int batch_size = -1) override;
 #ifdef PADDLE_MOBILE_FPGA
-  bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data, std::vector<int>* index_data,
-           int batch_size = -1) override;
   void FeedData(const std::vector<void *>& inputs) override;
   void GetResults(std::vector<void *>* outputs) override;
-  void Predict_From_To(int start = 0, int end = -1) override;
+  void Predict_From_To(int start, int end) override;
+  void FeedPaddleTensors(const std::vector<PaddleTensor>& inputs) override;
+  void FetchPaddleTensors(std::vector<PaddleTensor>* outputs) override;
 #endif
+
   ~PaddleMobilePredictor() override;
 
  private:
diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h
index 42509915d13cf7e632ed20c73f1320ec8bac09d1..f7e66740f0b5f732e7517db527ad60dd660d6807 100644
--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include <cassert>
 #include <memory>
 #include <string>
+#include <typeindex>
 #include <vector>
 
 // #define PADDLE_MOBILE_FPGA
@@ -33,12 +34,21 @@ namespace paddle_mobile {
 #ifdef PADDLE_MOBILE_FPGA
 namespace fpga {
 int open_device();
-}
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+}  // namespace fpga
 #endif
 
 enum PaddleDType {
   FLOAT32,
+  FLOAT16,
   INT64,
+  INT8,
+};
+
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
 };
 
 class PaddleBuf {
@@ -78,6 +88,8 @@ struct PaddleTensor {
   // TODO(Superjomn) for LoD support, add a vector<vector<size_t>> field if needed.
   PaddleBuf data;  // blob of data.
   PaddleDType dtype;
+  std::type_index dtypeid = typeid(float);
+  LayoutType layout;
 };
 
 enum class PaddleEngineKind {
@@ -116,12 +128,11 @@ class PaddlePredictor {
     std::string param_file;
   };
 #ifdef PADDLE_MOBILE_FPGA
-  virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data,
-                   std::vector<int>* index_data, int batch_size = -1) = 0;
   virtual void FeedData(const std::vector<void *>& inputs) = 0;
   virtual void GetResults(std::vector<void *>* outputs) = 0;
-  virtual void Predict_From_To(int start = 0, int end = -1) = 0;
+  virtual void Predict_From_To(int start, int end) = 0;
+  virtual void FeedPaddleTensors(const std::vector<PaddleTensor>& inputs) = 0;
+  virtual void FetchPaddleTensors(std::vector<PaddleTensor>* outputs) = 0;
 #endif
 
  protected:
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index 0dfa9d0500847c80e78a156b9c82a33d1dfd4a00..687185e82a44806783535e084cf34e90ca09882d 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -231,11 +231,23 @@ template <typename Device, typename T>
 void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
   executor_->FeedData(v);
 };
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::FeedData(
+    const std::vector<framework::Tensor> &v) {
+  executor_->FeedData(v);
+};
+
 template <typename Device, typename T>
 void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
   executor_->GetResults(v);
 }
 
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::GetResults(std::vector<framework::Tensor *> *v) {
+  executor_->GetResults(v);
+}
+
 template <typename Device, typename T>
 std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
     int id) {
diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h
index d608abcac79d2a5ae79ad375a8cb93d4594d1e8a..1aa0efd6beaadaa461643610023ae10a3543604f 100644
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -91,7 +91,11 @@ class PaddleMobile {
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
   void FeedData(const std::vector<void *> &v);
+  void FeedData(const std::vector<framework::Tensor> &v);
+
   void GetResults(std::vector<void *> *v);
+  void GetResults(std::vector<framework::Tensor *> *v);
+
   std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3af55f075805361fd0cff40ab2e53752ea63f781..138362f20892cb1b5db9bf0a2c83baec79f5f0f4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -80,6 +80,9 @@ if (CON GREATER -1)
     ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-marker paddle-mobile)
 
+    ADD_EXECUTABLE(test-rfcn-api fpga/test_rfcn_api.cpp)
+    target_link_libraries(test-rfcn-api paddle-mobile)
+
     set(FOUND_MATCH ON)
 endif ()
diff --git a/test/fpga/test_rfcn_api.cpp b/test/fpga/test_rfcn_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5cd910080d8f45b40806a10ef2f50b2a6f1219bc
--- /dev/null
+++ b/test/fpga/test_rfcn_api.cpp
@@ -0,0 +1,135 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include <iostream>
+#include "../../src/io/paddle_inference_api.h"
+
+using namespace paddle_mobile;
+using namespace paddle_mobile::fpga;
+
+static const char *g_image = "../models/rfcn/data.bin";
+static const char *g_model = "../models/rfcn/model";
+static const char *g_param = "../models/rfcn/params";
+
+void readStream(std::string filename, char *buf) {
+  std::ifstream in;
+  in.open(filename, std::ios::in | std::ios::binary);
+  if (!in.is_open()) {
+    std::cout << "open File Failed." << std::endl;
+    return;
+  }
+
+  in.seekg(0, std::ios::end);  // go to the end
+  auto length = in.tellg();    // report location (this is the length)
+  in.seekg(0, std::ios::beg);  // go back to the beginning
+  in.read(buf, length);
+  in.close();
+}
+
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.prog_file = g_model;
+  config.param_file = g_param;
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.lod_mode = true;
+  config.quantification = false;
+  return config;
+}
+
+int main() {
+  open_device();
+  PaddleMobileConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddleEngineKind::kPaddleMobile>(config);
+
+  std::cout << "after loading model" << std::endl;
+
+  float img_info[3] = {768, 1536, 768.0f / 960.0f};
+  int img_length = 768 * 1536 * 3;
+  auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
+  readStream(g_image, reinterpret_cast<char *>(img));
+
+  std::cout << "after initializing data" << std::endl;
+/*
+  predictor->FeedData({img_info, img});
+  predictor->Predict_From_To(0, -1);
+  std::cout << " Finishing predicting " << std::endl;
+
+  std::vector<void *> v(3, nullptr);
+  predictor->GetResults(&v);
+  int post_nms = 300;
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 8; i++) {
+      std::cout << ((float *)(v[0]))[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 8; i++) {
+      std::cout << ((float *)(v[1]))[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 4; i++) {
+      std::cout << ((float *)(v[2]))[num * 4 + i] << std::endl;
+    }
+  }
+*/
+
+  struct PaddleTensor t_img_info, t_img;
+  t_img_info.dtype = FLOAT32;
+  t_img_info.layout = LAYOUT_HWC;
+  t_img_info.shape = std::vector<int>({1, 3});
+  t_img_info.name = "Image information";
+  t_img_info.data.Reset(img_info, 3 * sizeof(float));
+
+  t_img.dtype = FLOAT32;
+  t_img.layout = LAYOUT_HWC;
+  t_img.shape = std::vector<int>({1, 768, 1536, 3});
+  t_img.name = "Image information";
+  t_img.data.Reset(img, img_length * sizeof(float));
+  predictor->FeedPaddleTensors({t_img_info, t_img});
+
+  std::cout << "Finishing feeding data " << std::endl;
+
+  predictor->Predict_From_To(0, -1);
+  std::cout << "Finishing predicting " << std::endl;
+
+  std::vector<PaddleTensor> v(3, PaddleTensor());
+  predictor->FetchPaddleTensors(&v);
+  auto post_nms = v[0].data.length() / sizeof(float) / 8;
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 8; i++) {
+      auto p = reinterpret_cast<float *>(v[0].data.data());
+      std::cout << p[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 8; i++) {
+      auto p = reinterpret_cast<float *>(v[1].data.data());
+      std::cout << p[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 4; i++) {
+      auto p = reinterpret_cast<float *>(v[2].data.data());
+      std::cout << p[num * 4 + i] << std::endl;
+    }
+  }
+  return 0;
+}