Merge remote-tracking branch 'upstream/develop' into develop

489e06d1 · xiebaiyuan · ed5a5afd · f100efc9 · 489e06d1 · 489e06d1
8 changed files
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -17,17 +17,21 @@ limitations under the License. */
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <algorithm>
+#include <map>
 #include "bias_scale.h"
 #include "filter.h"
 #include "image.h"
 #define FPGA_TEST_MODE
-//#define PADDLE_MOBILE_OS_LINUX
+#define PADDLE_MOBILE_OS_LINUX
 namespace paddle_mobile {
 namespace fpga {
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
+#ifdef PADDLE_MOBILE_OS_LINUX
+static std::map<void *, size_t> memory_map;
+#endif
 static inline int do_ioctl(int req, const void *arg) {
 #ifdef PADDLE_MOBILE_OS_LINUX
@@ -48,10 +52,13 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
-  DLOG << size << " bytes allocated";
+  static uint64_t counter = 0;
+  counter += size;
+  DLOG << size << " bytes allocated. Total " << counter << " bytes";
 #ifdef PADDLE_MOBILE_OS_LINUX
-  return reinterpret_cast<void *>(
+  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+  memory_map.insert(std::make_pair(ptr, size));
+  return ptr;
 #else
  return malloc(size);
 #endif
@@ -59,7 +66,16 @@ void *fpga_malloc(size_t size) {
 void fpga_free(void *ptr) {
 #ifdef PADDLE_MOBILE_OS_LINUX
-  munmap(ptr, 0);
+  static uint64_t counter = 0;
+  size_t size = 0;
+  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
+  if (iter != memory_map.end()) {
+    size = iter->second;
+    munmap(ptr, size);
+    memory_map.erase(iter);
+  }
+  counter += size;
+  DLOG << size << " bytes freed. Total " << counter << " bytes";
 #else
  free(ptr);
 #endif

--- a/src/fpga/api.h
+++ b/src/fpga/api.h
@@ -20,8 +20,6 @@ limitations under the License. */
 #include <limits>
 #include "framework/tensor.h"
-// memory management;
 namespace paddle_mobile {
 namespace fpga {
@@ -45,9 +43,6 @@ struct MemoryCopyArgs {
  size_t size;
 };
-/**
-Conv and Pooling kernel
-*/
 struct KernelArgs {
  uint32_t width;
  uint32_t height;
@@ -109,7 +104,6 @@ struct PoolingArgs {
  struct ImageOutputArgs output;
 };
-// elementwise add arguments
 struct EWAddArgs {
  bool relu_enabled;

--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -289,12 +289,8 @@ class Tensor {
    virtual std::type_index type() const { return type_; }
    virtual void set_type(std::type_index type) { type_ = type; }
-#ifndef PADDLE_MOBILE_FPGA
-    /*! the pointer of memory block. */
    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
-#else
-    std::shared_ptr<uint8_t> ptr_;
-#endif
    /*! the size of memory block. */
    size_t size_;

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -662,13 +662,15 @@ void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
 };
 template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult() {
+std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
-  auto last_op = ops.rbegin();
-  auto output_map = (*last_op)->Outputs();
+  PADDLE_MOBILE_ENFORCE(id < ops.size(), "Index out of range");
-  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
+  auto last_op = id < 0 ? ops[ops.size() - 1] : ops[id];
+  auto output_map = last_op->Outputs();
+  std::vector<std::string> out_keys = last_op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "the last op contains no output");
  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
      out_keys[0], output_map, *(program_.scope));

--- a/src/io/executor.h
+++ b/src/io/executor.h
@@ -99,7 +99,7 @@ class Executor {
 public:
  void InjectVariable(const framework::Tensor &t, string var_name);
  void FeedData(const framework::Tensor &t);
-  std::shared_ptr<framework::Tensor> FetchResult();
+  std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
  void Predict_From_To(int start = 0, int end = -1);
  void Predict_From(int start);
  void Predict_To(int end);

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -138,8 +138,8 @@ void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
 };
 template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult() {
+std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) {
-  return executor_->FetchResult();
+  return executor_->FetchResult(id);
 };
 template <typename Dtype, Precision P>

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -97,7 +97,7 @@ class PaddleMobile {
 public:
  void InjectVariable(const framework::Tensor &t, string var_name);
  void FeedData(const framework::Tensor &t);
-  std::shared_ptr<framework::Tensor> FetchResult();
+  std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
  void Predict_From_To(int start = 0, int end = -1);
  void Predict_From(int start);
  void Predict_To(int end);

--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
@@ -54,7 +54,13 @@ int main() {
    paddle_mobile.FeedData(input_tensor);
    paddle_mobile.Predict_To(10);
    paddle_mobile.Predict_From(10);
-    paddle_mobile.FetchResult();
+    auto tensor_ptr = paddle_mobile.FetchResult(9);
+    std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel()
+              << std::endl;
+    auto result_ptr = paddle_mobile.FetchResult();
+    std::cout << "Result tensor element number: " << result_ptr->numel()
+              << std::endl;
    auto time4 = time();
    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
              << std::endl;