Merge branch 'develop' into develop

bd465d62 · suiyang · GitHub · 149870a2 · 41578c1f · bd465d62
11 changed files
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -114,6 +114,9 @@ class CLEngine {

  // Returns devices_[index] (index defaults to 0). No bounds check is done here.
  cl_device_id DeviceID(int index = 0) { return devices_[index]; }

+  // Returns a copy of cl_path_, the path last stored through setClPath().
+  // Const-qualified because it only reads the member.
+  std::string GetCLPath() const { return cl_path_; }
+  // Stores cl_path in cl_path_. The argument is already a private copy
+  // (passed by value), so it is swapped into the member instead of being
+  // copied a second time; the previous value is released when cl_path dies.
+  void setClPath(std::string cl_path) { cl_path_.swap(cl_path); }
+
 private:
  // Private constructor: only marks the engine as not yet initialized.
  CLEngine() { initialized_ = false; }

@@ -129,6 +132,7 @@ class CLEngine {

  cl_int status_;

+  std::string cl_path_;
  std::unique_ptr<_cl_program, CLProgramDeleter> program_;

  //  bool SetClContext();

--- a/src/framework/cl/cl_scope.h
+++ b/src/framework/cl/cl_scope.h
@@ -58,7 +58,8 @@ class CLScope {
    }

    auto program = CLEngine::Instance()->CreateProgramWith(
-        context_.get(), "./cl_kernel/" + file_name);
+        context_.get(),
+        CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);

    DLOG << " --- begin build program -> " << file_name << " --- ";
    CLEngine::Instance()->BuildProgram(program.get());

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -649,12 +649,14 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {

 template <>
 void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
-  char *origin_data;
+  char *origin_data = nullptr;
+  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outter memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
+    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
@@ -701,7 +703,9 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
      }
    }
  }
+  if (self_alloc) {
    delete origin_data;
+  }
  LOG(kLOG_INFO) << " end init combine memory ";
 }


--- a/src/framework/loader.cpp
+++ b/src/framework/loader.cpp
@@ -82,6 +82,54 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
    }
  }
 }
+/**
+ * GPU_CL / FP32 specialization: builds a Program from a combined model that
+ * is already held in memory.
+ *
+ * @param read_size            size in bytes of the serialized ProgramDesc
+ * @param buf                  serialized ProgramDesc; enforced to be non-null
+ * @param combined_params_len  value recorded in program.combined_params_len
+ * @param combined_params_buf  parameter buffer; only the pointer is recorded
+ *                             in the returned Program, nothing is copied here
+ * @param optimize             when true, run FusionOptimize (falling back to
+ *                             the original desc if it yields nothing) and
+ *                             print the optimized description; when false,
+ *                             print the original description and leave
+ *                             program.optimizeProgram untouched
+ * @param quantification       value recorded in program.quantification
+ * @return the populated Program, with combined set to true
+ */
+template <>
+const Program<GPU_CL, Precision::FP32>
+Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
+    size_t read_size, const uint8_t *buf, size_t combined_params_len,
+    uint8_t *combined_params_buf, bool optimize, bool quantification) {
+  PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
+
+  PaddleMobile__Framework__Proto__ProgramDesc *c_program =
+      paddle_mobile__framework__proto__program_desc__unpack(nullptr, read_size,
+                                                            buf);
+  PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
+
+  // Only peek at the first block when one exists, so that a program without
+  // blocks is not dereferenced just to produce a debug line.
+  if (c_program->n_blocks > 0) {
+    DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
+  }
+
+  auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
+
+  Program<GPU_CL, Precision::FP32> program;
+  program.combined = true;
+  program.originProgram = originProgramDesc;
+  program.quantification = quantification;
+  program.combined_params_len = combined_params_len;
+  program.combined_params_buf = combined_params_buf;
+
+  auto scope = std::make_shared<Scope>();
+  program.scope = scope;
+  InitMemoryFromProgram(originProgramDesc, scope);
+
+  if (optimize) {
+    bool can_add_split = false;
+    ProgramOptimize program_optimize;
+    program.optimizeProgram =
+        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
+    // Fall back to the unoptimized description when fusion returns nothing.
+    if (!program.optimizeProgram) {
+      program.optimizeProgram = originProgramDesc;
+    }
+    program.optimizeProgram->Description("optimize: ");
+  } else {
+    originProgramDesc->Description("program: ");
+  }
+
+  // Release the unpacked protobuf-c message now that the descs are built.
+  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
+                                                               nullptr);
+  return program;
+}
+
 #endif

 /**

--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -29,7 +29,9 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
 template <typename Dtype, Precision P>
 bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
  paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
-
+#ifdef PADDLE_MOBILE_CL
+  paddle_mobile_->SetCLPath(config.cl_path);
+#endif
  if (config.memory_pack.from_memory) {
    DLOG << "load from memory!";
    paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,

--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -132,6 +132,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config {
  int thread_num = 1;
  std::string prog_file;
  std::string param_file;
+  std::string cl_path;
  struct PaddleModelMemoryPack memory_pack;
 };


--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -158,6 +158,13 @@ void PaddleMobile<Dtype, P>::Predict_To(int end) {
 }
 #endif

+#ifdef PADDLE_MOBILE_CL
+// Hands `path` to the object returned by framework::CLEngine::Instance()
+// through its setClPath() method. Compiled only when PADDLE_MOBILE_CL is set.
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
+  auto &&engine = framework::CLEngine::Instance();
+  engine->setClPath(path);
+}
+#endif
+
 template class PaddleMobile<CPU, Precision::FP32>;
 template class PaddleMobile<FPGA, Precision::FP32>;
 template class PaddleMobile<GPU_MALI, Precision::FP32>;

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -26,6 +26,9 @@ limitations under the License. */
 #include "framework/load_ops.h"
 #include "framework/loader.h"
 #include "framework/tensor.h"
+#ifdef PADDLE_MOBILE_CL
+#include "framework/cl/cl_engine.h"
+#endif

 namespace paddle_mobile {

@@ -68,6 +71,11 @@ class PaddleMobile {
  void Predict_To(int end);
 #endif

+#ifdef PADDLE_MOBILE_CL
+ public:
+  void SetCLPath(std::string cl_path);
+#endif
+
 private:
  std::shared_ptr<framework::Loader<Dtype, P>> loader_;
  std::shared_ptr<framework::Executor<Dtype, P>> executor_;

--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -3307,9 +3307,14 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+      if (bias == nullptr) {
+        InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
+                            &C(i, 0), ldc, relu, nullptr);
+      } else {
        InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
                            &C(i, 0), ldc, relu, bias + i);
      }
+    }
  } else {
 #pragma omp parallel for
    for (int j = 0; j < n; j += NC) {

--- a/test/framework/test_load_memory_inference_api.cpp
+++ b/test/framework/test_load_memory_inference_api.cpp
@@ -55,11 +55,11 @@ static char *Get_binary_data(std::string filename) {
 paddle_mobile::PaddleMobileConfig GetConfig() {
  paddle_mobile::PaddleMobileConfig config;
  config.precision = paddle_mobile::PaddleMobileConfig::FP32;
-  config.device = paddle_mobile::PaddleMobileConfig::kCPU;
+  config.device = paddle_mobile::PaddleMobileConfig::kGPU_CL;
  const std::shared_ptr<paddle_mobile::PaddleModelMemoryPack> &memory_pack =
      std::make_shared<paddle_mobile::PaddleModelMemoryPack>();
-  auto model_path = std::string(g_genet_combine) + "/model";
-  auto params_path = std::string(g_genet_combine) + "/params";
+  auto model_path = std::string(g_mobilenet_combined) + "/model";
+  auto params_path = std::string(g_mobilenet_combined) + "/params";
  memory_pack->model_size =
      ReadBuffer(model_path.c_str(), &memory_pack->model_buf);
  std::cout << "sizeBuf: " << memory_pack->model_size << std::endl;

--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -22,7 +22,7 @@ int main() {
  auto time1 = paddle_mobile::time();
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                     std::string(g_mobilenet_detect) + "/params", true);
-
+  paddle_mobile.SetCLPath(".");
  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
  if (isok) {
    auto time2 = paddle_mobile::time();