Unverified Commit cc501377 authored by H HappyAngel, committed by GitHub

Merge pull request #145 from PaddlePaddle/develop

pull new code
......@@ -60,7 +60,7 @@ For demands of Apple's GPU Metal and web front end inference, please see `./meta
Paddle Lite has referenced the following open-source projects:
- [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29)
- [ARM compute library](https://github.com/ARM-software/ComputeLibrary)
- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations from Anakin have been incorporated into Paddle Lite, so there will be no further updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite.
......
......@@ -48,7 +48,7 @@ The metal and web modules are relatively independent and will continue to live under the `./metal` and `./web` directories
## Acknowledgements:
Paddle Lite has referenced the following open-source projects:
- [ARM compute library]((https://github.com/ARM-software/ComputeLibrary))
- [ARM compute library](https://github.com/ARM-software/ComputeLibrary)
- [Anakin](https://github.com/PaddlePaddle/Anakin). Some of Anakin's low-level optimizations have been integrated into Paddle Lite. As a high-performance inference project under the PaddlePaddle organization, Anakin was forward-looking and made important contributions to Paddle Lite. Anakin has been merged into this project and will no longer be updated.
## Communication and Feedback
......
......@@ -177,6 +177,8 @@ namespace lite_api {
template <>
std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(
const CxxConfig &config) {
static std::mutex mutex_conf;
std::unique_lock<std::mutex> lck(mutex_conf);
auto x = std::make_shared<lite::CxxPaddleApiImpl>();
x->Init(config);
return x;
......
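A note on the hunk above: declaring the mutex `static` inside `CreatePaddlePredictor` means every call shares one lock, so concurrent predictor creations are serialized and `Init(config)` never runs in two threads at once. Below is a minimal standalone sketch of the same pattern; the `Config` and `Predictor` types here are hypothetical stand-ins, not Paddle Lite's API.

```cpp
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

struct Config {};
struct Predictor {
  void Init(const Config&) { /* assume this is not thread-safe by itself */ }
};

std::shared_ptr<Predictor> CreatePredictor(const Config& config) {
  // A function-local static is initialized exactly once, so all callers
  // contend on the same mutex.
  static std::mutex mutex_conf;
  std::unique_lock<std::mutex> lck(mutex_conf);
  auto x = std::make_shared<Predictor>();
  x->Init(config);  // runs under the lock; Init calls never overlap
  return x;
}

int main() {
  Config config;
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back([&] { auto p = CreatePredictor(config); });
  }
  for (auto& t : threads) t.join();
  return 0;
}
```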
......@@ -33,19 +33,19 @@ class Variable {
template <typename T>
T* GetMutable() {
if (!blob_.is<T>()) blob_.set<T>();
if (!blob_.valid()) {
blob_.set<T>();
}
return blob_.get_mutable<T>();
}
template <typename T>
bool IsType() {
return blob_.type() == typeid(T).hash_code();
return blob_.is_type<T>();
}
private:
// variant<int, float, std::string, lite::Tensor> blob_;
variant<int, float, std::string, lite::Tensor, std::vector<lite::Tensor>>
blob_;
Any blob_;
};
} // namespace lite
......
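For context on the `Variable` change above: `blob_` is now a type-erased `Any`, so `GetMutable<T>()` only checks `valid()` and default-constructs a `T` on first use; after that, callers are expected to keep asking for the same `T`. The sketch below illustrates that contract with a deliberately tiny `Any` stand-in (hypothetical code, not the real `utils::Any`; cleanup of the stored object is omitted for brevity).

```cpp
#include <cassert>
#include <typeinfo>

// A stripped-down stand-in for utils::Any, just enough to show the
// GetMutable/IsType contract used by lite::Variable above.
class TinyAny {
 public:
  template <typename T>
  void set() { data_ = new T(); info_ = &typeid(T); }  // assume set once
  bool valid() const { return data_ != nullptr; }
  template <typename T>
  bool is_type() const { return info_ != nullptr && *info_ == typeid(T); }
  template <typename T>
  T* get_mutable() { return static_cast<T*>(data_); }
 private:
  void* data_ = nullptr;                 // leaked on purpose; sketch only
  const std::type_info* info_ = nullptr;
};

class Variable {
 public:
  template <typename T>
  T* GetMutable() {
    if (!blob_.valid()) blob_.set<T>();  // lazily default-construct
    return blob_.get_mutable<T>();       // caller must keep using the same T
  }
  template <typename T>
  bool IsType() { return blob_.is_type<T>(); }
 private:
  TinyAny blob_;
};

int main() {
  Variable v;
  *v.GetMutable<int>() = 42;  // first call fixes the stored type
  assert(v.IsType<int>());
  assert(!v.IsType<float>());
  return 0;
}
```

Note the design trade-off: unlike the old `variant`-based version, `GetMutable` no longer re-checks the stored type, so requesting a different `T` on a non-empty `Variable` would silently return a mis-typed pointer; `IsType` is the guard callers should use.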
......@@ -25,26 +25,6 @@ namespace lite {
namespace kernels {
namespace arm {
template <typename Dtype>
void trans_basic(const int count,
const Dtype* din,
const int* permute_order,
const int* old_steps,
const int* new_steps,
const int num_axes,
Dtype* dout) {
for (int i = 0; i < count; ++i) {
int old_idx = 0;
int idx = i;
for (int j = 0; j < num_axes; ++j) {
int order = permute_order[j];
old_idx += (idx / new_steps[j]) * old_steps[order];
idx %= new_steps[j];
}
dout[i] = din[old_idx];
}
}
template <typename Dtype>
void transpose_mat(const Dtype* din,
Dtype* dout,
......@@ -201,6 +181,61 @@ void TransposeCompute::PrepareForRun() {
_old_steps = get_stride(input->dims());
}
}
template <typename Dtype>
void TransposeCompute_(const std::vector<int>& axis,
const lite::Tensor* input,
lite::Tensor* output) {
const Dtype* input_ptr = input->data<Dtype>();
Dtype* output_ptr = output->mutable_data<Dtype>();
// The rank of input and output must be >= 2 and <= 6.
const DDim& in_dim = input->dims();
const DDim& out_dim = output->dims();
// precompute inverted output dim and strides
size_t rout_dim[6], strides[6];
int permute = axis.size(); // permute must be >= 2 and <= 6.
for (int i = 0; i < permute; ++i) {
int k = permute - 1 - i;
strides[k] = 1;
for (int j = axis[i] + 1; j < permute; ++j) {
strides[k] *= in_dim[j];
}
rout_dim[k] = out_dim[i];
}
// unroll the first 2 dimensions
int remain_dim = 1;
for (int i = 2; i < out_dim.size(); ++i) {
remain_dim *= out_dim[i];
}
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int j = 0; j < out_dim[1]; ++j) {
size_t offset = batch * strides[permute - 1] + j * strides[permute - 2];
Dtype* out_ptr = output_ptr + (batch * out_dim[1] + j) * remain_dim;
int indices[4] = {0, 0, 0, 0};
for (int k = 0; k < remain_dim; ++k) {
out_ptr[k] = input_ptr[offset];
indices[0] += 1;
offset += strides[0];
for (int p = 0; p < permute - 3; ++p) {
if (indices[p] == rout_dim[p]) {
indices[p + 1] += 1;
indices[p] = 0;
offset += strides[p + 1];
offset -= rout_dim[p] * strides[p];
} else {
break;
}
}
}
}
}
}
// Transpose
void TransposeCompute::Run() {
auto& param = Param<operators::TransposeParam>();
......@@ -220,13 +255,7 @@ void TransposeCompute::Run() {
if (trans_mat) {
transpose_mat(din, dout, _trans_num, _trans_w, _trans_h);
} else {
trans_basic(output->numel(),
din,
param.axis.data(),
_old_steps.data(),
_new_steps.data(),
input->dims().size(),
dout);
TransposeCompute_<float>(axis, input, output);
}
}
......
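To make the index arithmetic in `TransposeCompute_` easier to verify, here is a self-contained toy that performs the same kind of permutation using explicit input/output strides, in the spirit of the removed `trans_basic` and without the OpenMP two-dimension unrolling. All names in it are illustrative only.

```cpp
#include <cassert>
#include <vector>

// Row-major strides: strides[i] = product of dims[i+1..].
std::vector<int> strides_of(const std::vector<int>& dims) {
  std::vector<int> s(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    s[i] = s[i + 1] * dims[i + 1];
  }
  return s;
}

int main() {
  // Transpose a 2x3x4 tensor with axis order {2, 0, 1} -> shape 4x2x3.
  std::vector<int> in_dims = {2, 3, 4}, axis = {2, 0, 1};
  std::vector<int> out_dims(axis.size());
  for (size_t i = 0; i < axis.size(); ++i) out_dims[i] = in_dims[axis[i]];

  std::vector<int> old_steps = strides_of(in_dims);   // {12, 4, 1}
  std::vector<int> new_steps = strides_of(out_dims);  // {6, 3, 1}

  std::vector<float> din(24), dout(24);
  for (int i = 0; i < 24; ++i) din[i] = static_cast<float>(i);

  // For each output linear index: decode it against the output strides,
  // then re-encode the digits against the strides of the permuted axes.
  for (int i = 0; i < 24; ++i) {
    int old_idx = 0, idx = i;
    for (size_t j = 0; j < axis.size(); ++j) {
      old_idx += (idx / new_steps[j]) * old_steps[axis[j]];
      idx %= new_steps[j];
    }
    dout[i] = din[old_idx];
  }

  // Output (a, b, c) reads input (b, c, a); stepping c walks the old middle
  // axis (stride 4), so dout[0..2] == din[0], din[4], din[8].
  assert(dout[0] == 0.f && dout[1] == 4.f && dout[2] == 8.f);
  return 0;
}
```

The kernel above does the same decode/re-encode, but precomputes inverted strides and unrolls the first two output dimensions so the `collapse(2)` OpenMP pragma can parallelize over them.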
......@@ -62,6 +62,9 @@ class Any {
template <typename T, typename... Args>
inline void construct(Args&&... args);
template <typename T>
inline bool is_type() const;
private:
template <typename T>
class TypeOnHeap;
......@@ -214,6 +217,14 @@ inline const std::type_info& Any::type() const {
}
}
template <typename T>
inline bool Any::is_type() const {
if ((type_ == nullptr) || (*(type_->ptype_info) != typeid(T))) {
return false;
}
return true;
}
template <typename T>
inline void Any::check_type() const {
CHECK_EQ((type_ == nullptr), false);
......
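As a usage note on the new `Any::is_type<T>()` above: comparing `std::type_info` objects directly is exact and null-safe, whereas the old `Variable::IsType` compared `hash_code()` values, which in principle can collide and presupposes a stored type. A minimal sketch of the same pattern, assuming a simplified type tag rather than the real `Any` internals:

```cpp
#include <cassert>
#include <typeinfo>

// Hypothetical, simplified version of the type record kept by Any.
struct TypeTag {
  const std::type_info* ptype_info;
};

class MiniAny {
 public:
  template <typename T>
  void set() {
    static const TypeTag tag{&typeid(T)};  // one tag per instantiated T
    type_ = &tag;
  }
  template <typename T>
  bool is_type() const {
    // Null-safe: an empty MiniAny matches no type. Comparing type_info
    // objects avoids the theoretical hash_code() collision.
    return type_ != nullptr && *(type_->ptype_info) == typeid(T);
  }
 private:
  const TypeTag* type_ = nullptr;
};

int main() {
  MiniAny a;
  assert(!a.is_type<int>());  // safe even before set()
  a.set<int>();
  assert(a.is_type<int>());
  assert(!a.is_type<float>());
  return 0;
}
```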