Merge pull request #977 from zhangyang0701/develop

implement predict_from_to for FPGA track; close #976

Merge pull request #977 from zhangyang0701/develop
implement predict_from_to for FPGA track; close #976
2cada5ed · zhangyang0701 · GitHub · 4d8a07c7 · 041a31a2 · 2cada5ed
6 changed files
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -93,6 +93,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
      depManager[i].analysisDep(ops_of_block_[*block_desc.get()]);
 #endif
    }
+    DLOG << "Total " << ops.size() << " ops have been created ";
  }
  if (program_.combined) {
    InitCombineMemory();
@@ -643,6 +644,75 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
  return result_vector;
 }

+#ifdef PADDLE_MOBILE_FPGA
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
+                                        string var_name) {
+  framework::Variable *g_feed_value = program_.scope->Var(var_name);
+  framework::Tensor *feed_tensor =
+      g_feed_value->GetMutable<framework::LoDTensor>();
+  feed_tensor->Resize(t.dims());
+  feed_tensor->ShareDataWith(t);
+};
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
+  InjectVariable(t, "feed");
+};
+
+template <typename Dtype, Precision P>
+std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult() {
+  std::shared_ptr<framework::BlockDesc> to_predict_block =
+      to_predict_program_->Block(0);
+  auto &ops = ops_of_block_[*to_predict_block.get()];
+  auto last_op = ops.rbegin();
+  auto output_map = (*last_op)->Outputs();
+  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
+  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "the last op contains no output");
+  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
+      out_keys[0], output_map, *(program_.scope));
+  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
+};
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_From_To(int start, int end) {
+  std::shared_ptr<framework::BlockDesc> to_predict_block =
+      to_predict_program_->Block(0);
+  auto &ops = ops_of_block_[*to_predict_block.get()];
+  end = end < 0 ? (int)ops.size() : end;
+  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
+                        "start or end parameter is wrong");
+
+#ifdef PADDLE_MOBILE_PROFILE
+  std::vector<ProfInfo> profile(ops.size());
+#endif
+  for (int i = start; i < end; i++) {
+#ifdef PADDLE_MOBILE_PROFILE
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+#endif
+    ops[i]->Run();
+
+#ifdef PADDLE_MOBILE_PROFILE
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+#endif
+  }
+};
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_From(int start) {
+  Predict_From_To(start);
+};
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_To(int end) {
+  Predict_From_To(0, end);
+};
+#endif
+
 template class Executor<CPU, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
 template class Executor<FPGA, Precision::FP32>;

--- a/src/io/executor.h
+++ b/src/io/executor.h
@@ -30,6 +30,7 @@ limitations under the License. */
 #include <thread>
 #include "common/dep_core.h"
 #endif
+using std::string;

 namespace paddle_mobile {

@@ -92,6 +93,17 @@ class Executor {
  bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc,
                      framework::Variable *var,
                      framework::LoDTensor *tensor) const;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ public:
+  void InjectVariable(const framework::Tensor &t, string var_name);
+  void FeedData(const framework::Tensor &t);
+  std::shared_ptr<framework::Tensor> FetchResult();
+  void Predict_From_To(int start = 0, int end = -1);
+  void Predict_From(int start);
+  void Predict_To(int end);
+#endif
 };

 }  // namespace paddle_mobile
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -124,6 +124,40 @@ PaddleMobile<Dtype, P>::~PaddleMobile() {
  loader_ = nullptr;
 }

+#ifdef PADDLE_MOBILE_FPGA
+
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t,
+                                            string var_name) {
+  executor_->InjectVariable(t, var_name);
+}
+
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
+  executor_->FeedData(t);
+};
+
+template <typename Dtype, Precision P>
+std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult() {
+  return executor_->FetchResult();
+};
+
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) {
+  executor_->Predict_From_To(start, end);
+};
+
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::Predict_From(int start) {
+  executor_->Predict_From(start);
+};
+
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::Predict_To(int end) {
+  executor_->Predict_To(end);
+};
+#endif
+
 template class PaddleMobile<CPU, Precision::FP32>;

 template class PaddleMobile<FPGA, Precision::FP32>;

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -92,6 +92,16 @@ class PaddleMobile {
 private:
  std::shared_ptr<Loader<Dtype, P>> loader_;
  std::shared_ptr<Executor<Dtype, P>> executor_;
+
+#ifdef PADDLE_MOBILE_FPGA
+ public:
+  void InjectVariable(const framework::Tensor &t, string var_name);
+  void FeedData(const framework::Tensor &t);
+  std::shared_ptr<framework::Tensor> FetchResult();
+  void Predict_From_To(int start = 0, int end = -1);
+  void Predict_From(int start);
+  void Predict_To(int end);
+#endif
 };

 }  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
@@ -39,7 +39,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
  poolArgs.image.pad_width = (uint32_t)paddings[1];
  poolArgs.image.scale_address = input->scale;
  poolArgs.output.address = output_ptr;
-  poolArgs.output.scale_address = input->scale;
+  poolArgs.output.scale_address = output->scale;
  poolArgs.kernel.height = (uint32_t)ksize[0];
  poolArgs.kernel.width = (uint32_t)ksize[1];
  poolArgs.kernel.stride_h = (uint32_t)strides[0];

--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
@@ -36,7 +36,8 @@ int main() {

    std::vector<float> input(input_tensor.data<float>(),
                             input_tensor.data<float>() + input_tensor.numel());
-    // 预热十次
+#ifndef PADDLE_MOBILE_FPGA
+    //   预热十次
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
@@ -47,7 +48,17 @@ int main() {
    auto time4 = time();
    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
              << std::endl;
-  }

+#else
+    auto time3 = time();
+    paddle_mobile.FeedData(input_tensor);
+    paddle_mobile.Predict_To(10);
+    paddle_mobile.Predict_From(10);
+    paddle_mobile.FetchResult();
+    auto time4 = time();
+    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+              << std::endl;
+#endif
+  }
  return 0;
 }