Optimize 5x5 depthwise conv for a 6x speedup

a07503a7 · hjchen2 · a27e0055 · a07503a7 · a07503a7 · a07503a7
11 changed files
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -64,9 +64,10 @@ void OperatorBase<Dtype>::Run() {
  for (const auto key : input_keys) {
    auto var_vec_in = inputs_.at(key);
    for (int i = 0; i < var_vec_in.size(); ++i) {
-      auto vari = scope_->FindVar(var_vec_in[i]);
+      // Resolve each input by its own variable name; a fixed name would dump
+      // the same variable for every key and index.
+      DLOG << var_vec_in[i];
+      auto vari = this->scope_->FindVar(var_vec_in[i]);
      if (vari->IsInitialized()) {
-        Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
+        const Tensor *tensor = vari->template Get<framework::LoDTensor>();
        if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
      }
    }
@@ -76,7 +77,7 @@ void OperatorBase<Dtype>::Run() {
    for (int i = 0; i < var_vec_out.size(); ++i) {
      auto vari = scope_->FindVar(var_vec_out[i]);
      if (vari->IsInitialized()) {
-        Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
+        const Tensor *tensor = vari->template Get<framework::LoDTensor>();
        if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
      }
    }
@@ -97,10 +98,10 @@ void OperatorBase<GPU_CL>::Run() {
      auto vari = scope_->FindVar(var_vec_in[i]);
      if (vari->IsInitialized()) {
        if (type_ == "feed") {
-          Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
+          const Tensor *tensor = vari->template Get<framework::LoDTensor>();
          if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
        } else {
-          CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
+          const CLImage *cl_image = vari->template Get<framework::CLImage>();
          if (cl_image) {
            DLOG << type_ << " input- " << key << "=" << *cl_image;
          }
@@ -114,12 +115,12 @@ void OperatorBase<GPU_CL>::Run() {
      auto vari = scope_->FindVar(var_vec_out[i]);
      if (vari->IsInitialized()) {
        if (type_ == "fetch") {
-          Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
+          const Tensor *tensor = vari->template Get<framework::LoDTensor>();
          if (tensor) {
            DLOG << type_ << " output- " << key << "=" << *tensor;
          }
        } else {
-          CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
+          const CLImage *cl_image = vari->template Get<framework::CLImage>();
          if (cl_image) {
            DLOG << type_ << " output- " << key << "=" << *cl_image;
          }

--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -14,6 +14,7 @@
 #include "io/api_paddle_mobile.h"
 #include <vector>
+#include "common/enforce.h"
 #include "framework/tensor.h"
 namespace paddle_mobile {

--- a/src/io/api_paddle_mobile.h
+++ b/src/io/api_paddle_mobile.h
@@ -12,19 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-/*
- * This file contains the implementation of inference API with Anakin engine
- * embeded, this API can only support Anakin models.
- */
 #pragma once
 #include <vector>
-#include "io/paddle_inference_api.h"
-// from paddle_mobile
-#include "common/enforce.h"
 #include "common/types.h"
+#include "io/paddle_inference_api.h"
 #include "io/paddle_mobile.h"
 namespace paddle_mobile {

--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -104,6 +104,8 @@ class PaddlePredictor {
  // The common configs for all the predictors.
  struct Config {
    std::string model_dir;  // path to the model directory.
+    std::string prog_file;  // forwarded as model_path by PaddleMobile::Load(config) when model_dir is empty.
+    std::string param_file;  // forwarded as para_path alongside prog_file; both must be non-empty to be used.
  };
 protected:
@@ -128,9 +130,8 @@ struct PaddleMobileConfig : public PaddlePredictor::Config {
  int batch_size = 1;
  bool optimize = true;
  bool quantification = false;
+  bool lod_mode = false;
  int thread_num = 1;
-  std::string prog_file;
-  std::string param_file;
  std::string cl_path;
  struct PaddleModelMemoryPack memory_pack;
 };

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -15,6 +15,9 @@ limitations under the License. */
 #include "io/paddle_mobile.h"
 #include <utility>
 #include "common/common.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 #ifdef PADDLE_MOBILE_CL
 #include <CL/cl.h>
 #include "framework/cl/cl_tensor.h"
@@ -33,7 +36,7 @@ void PaddleMobile<Device, T>::SetThreadNum(int num) {
 template <typename Device, typename T>
 PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
                                       bool optimize, bool quantification,
-                                       int batch_size, bool loddable) {
+                                       int batch_size, bool lod_mode) {
  if (loader_.get() == nullptr) {
    loader_ = std::make_shared<framework::Loader<Device, T>>();
  } else {
@@ -43,7 +46,7 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
  if (executor_.get() == nullptr) {
    executor_ = std::make_shared<framework::Executor<Device, T>>(
        loader_->Load(dirname, optimize, quantification), batch_size, optimize,
-        loddable);
+        lod_mode);
  } else {
    LOG(kLOG_INFO) << "executor inited";
  }
@@ -55,7 +58,7 @@ template <typename Device, typename T>
 PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
                                       const std::string &para_path,
                                       bool optimize, bool quantification,
-                                       int batch_size, bool loddable) {
+                                       int batch_size, bool lod_mode) {
  if (loader_.get() == nullptr) {
    loader_ = std::make_shared<framework::Loader<Device, T>>();
  } else {
@@ -65,7 +68,7 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
  if (executor_.get() == nullptr) {
    executor_ = std::make_shared<framework::Executor<Device, T>>(
        loader_->Load(model_path, para_path, optimize, quantification),
-        batch_size, optimize, loddable);
+        batch_size, optimize, lod_mode);
  } else {
    LOG(kLOG_INFO) << "executor inited";
  }
@@ -73,6 +76,21 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
  return PMSuccess;
 }
+// Loads a model from the sources named in `config`.
+// A non-empty model_dir is tried first; otherwise prog_file and param_file
+// are used, and both must be non-empty. When neither source is usable an
+// error is logged and PMNotInitialized is returned.
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Load(const PaddleMobileConfig &config) {
+  const bool has_model_dir = !config.model_dir.empty();
+  if (has_model_dir) {
+    return this->Load(config.model_dir, config.optimize, config.quantification,
+                      config.batch_size, config.lod_mode);
+  }
+  const bool has_prog_and_params =
+      !config.prog_file.empty() && !config.param_file.empty();
+  if (!has_prog_and_params) {
+    LOG(kLOG_ERROR) << "Failed to load inference model";
+    return PMNotInitialized;
+  }
+  return this->Load(config.prog_file, config.param_file, config.optimize,
+                    config.quantification, config.batch_size,
+                    config.lod_mode);
+}
 template <typename Device, typename T>
 bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
                                                 const uint8_t *model_buf,

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -18,15 +18,12 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
-#ifdef _OPENMP
-#include <omp.h>
-#endif  // _OPENMP
 #include "common/types.h"
 #include "framework/executor.h"
 #include "framework/load_ops.h"
 #include "framework/loader.h"
 #include "framework/tensor.h"
+#include "io/paddle_inference_api.h"
 #ifdef PADDLE_MOBILE_CL
 #include "framework/cl/cl_engine.h"
 #endif
@@ -46,10 +43,12 @@ class PaddleMobile {
  PMStatus Load(const std::string &dirname, const bool optimize = false,
                const bool quantification = false, const int batch_size = 1,
-                const bool lod = false);
+                const bool lod_mode = false);
  PMStatus Load(const std::string &model_path, const std::string &para_path,
                const bool optimize = false, const bool quantification = false,
-                const int batch_size = 1, const bool lod = false);
+                const int batch_size = 1, const bool lod_mode = false);
+  PMStatus Load(const PaddleMobileConfig &config);
  PMStatus Predict(const framework::Tensor &input);
  PMStatus Predict(const framework::LoDTensor &input);

--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -24,8 +24,12 @@ template <>
 bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
  bool conv3x3 = param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
                 param->Filter()->dims()[2] == 3;
+  bool conv5x5 = param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+                 param->Filter()->dims()[2] == 5;
  bool depth3x3 = conv3x3 && param->Groups() == param->Input()->dims()[1] &&
                  param->Input()->dims()[1] == param->Output()->dims()[1];
+  bool depth5x5 = conv5x5 && param->Groups() == param->Input()->dims()[1] &&
+                  param->Input()->dims()[1] == param->Output()->dims()[1];
  if (param->Filter()->type() == typeid(int8_t)) {
    if (depth3x3 && param->Strides()[0] < 3 &&
        param->Strides()[0] == param->Strides()[1]) {
@@ -46,6 +50,9 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
               param->Strides()[0] == 2 && param->Paddings()[0] == 1 &&
               param->Paddings()[0] == param->Paddings()[1]) {
      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P1_FLOAT;
+    } else if (depth5x5 && param->Strides()[0] == param->Strides()[1] &&
+               param->Strides()[0] == 1) {
+      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE5x5S1_FLOAT;
 #ifndef __aarch64__
    } else if (conv3x3 && param->Strides()[0] == param->Strides()[1] &&
               param->Dilations()[0] == param->Dilations()[1] &&
@@ -87,6 +94,10 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
                                 nullptr, false);
      break;
+    case ConvParam<CPU>::EXEC_DEPTHWISE5x5S1_FLOAT:
+      math::DepthwiseConv5x5S1<float, float>(*param.Input(), *param.Filter(),
+                                             param.Paddings(), param.Output());
+      break;
    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
      WinogradConv3x3<8, 3>(param);
      break;

--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv3x3.h"
+#include "operators/math/depthwise_conv5x5.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/pad.h"

--- a/src/operators/math/depthwise_conv5x5.cpp
+++ b/src/operators/math/depthwise_conv5x5.cpp
--- a/src/operators/math/depthwise_conv5x5.h
+++ b/src/operators/math/depthwise_conv5x5.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "framework/tensor.h"
+#include "operators/math/conv_func.h"
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+// Declarations of the specialised 5x5 depthwise convolution routines.
+// Only declarations live here; the definitions are not part of this header.
+// TODO(hjchen2) need to be implemented
+// template<typename Itype, typename Otype>
+// void DepthwiseConv5x5(const framework::Tensor *input,
+//                      const framework::Tensor *filter,
+//                      const std::vector<int> &strides,
+//                      const std::vector<int> &paddings,
+//                      framework::Tensor *output);
+// 5x5 depthwise convolution used by the CPU float conv kernel, which calls
+// the <float, float> instantiation when the filter is 5x5, the convolution
+// is depthwise and both strides are 1.
+// `input`, `filter` and `paddings` are read-only; `output` is the only
+// mutable argument and receives the conv param's Output() tensor.
+// NOTE(review): Itype/Otype are presumably the input and output element
+// types -- confirm against the definition.
+template <typename Itype, typename Otype>
+void DepthwiseConv5x5S1(const framework::Tensor &input,
+                        const framework::Tensor &filter,
+                        const std::vector<int> &paddings,
+                        framework::Tensor *output);
+// Same signature as DepthwiseConv5x5S1.
+// NOTE(review): presumably the stride-2 variant; no caller is visible in
+// this change, so confirm before relying on it.
+template <typename Itype, typename Otype>
+void DepthwiseConv5x5S2(const framework::Tensor &input,
+                        const framework::Tensor &filter,
+                        const std::vector<int> &paddings,
+                        framework::Tensor *output);
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -424,6 +424,8 @@ class ConvParam : public OpParam {
    EXEC_DEPTHWISE3x3_FLOAT,
    EXEC_WINOGRAD3X3_FLOAT,
    EXEC_WINOGRAD5X5_FLOAT,
+    EXEC_DEPTHWISE5x5S1_FLOAT,
+    EXEC_DEPTHWISE5x5S2_FLOAT,
    EXEC_GEMM_INT8,
    EXEC_DEPTHWISE3x3_INT8,
  };
@@ -2598,8 +2600,8 @@ class QuantizeParam : public OpParam {
  // if offine scale or not
  bool offline_ = false;
  // round method type
-  // RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
+  RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
-  RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
+  // RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
 };
 #endif