add NHWC NCHW transform, test=develop (#2381)

* add nhwc to nchw * add layout in funcs * change layout as extra, test=develop * change make, test=develop * use template class method to update layout NCHW and NHWC transform, test=develop * fix cmake error, set layout to extra, test=develop * fix test_layout_compute_arm test, it's extra * layout is extra, test=develop * fix error in kernels/arm/layout_compute.cc when registering the kernel, DataLayout must be NCHW, test=develop * delete extra note, test=develop * delete extra test * delete layout_test, test=develop, it's in tests/math/layout_compute_test * delete extra test, test=develop

add NHWC NCHW transform, test=develop (#2381)
* add nhwc to nchw * add layout in funcs * change layout as extra, test=develop * change make, test=develop * use template class method to update layout NCHW and NHWC transform, test=develop * fix cmake error, set layout to extra, test=develop * fix test_layout_compute_arm test, it's extra * layout is extra, test=develop * fix error in kernels/arm/layout_compute.cc when registering the kernel, DataLayout must be NCHW, test=develop * delete extra note, test=develop * delete extra test * delete layout_test, test=develop, it's in tests/math/layout_compute_test * delete extra test, test=develop
6b3c341f · HappyAngel · GitHub · 66d2ae25 · 6b3c341f · 6b3c341f
17 changed files
--- a/lite/backends/arm/math/CMakeLists.txt
+++ b/lite/backends/arm/math/CMakeLists.txt
@@ -57,8 +57,8 @@ endif()

 if (NOT HAS_ARM_MATH_LIB_DIR)
  # TODO(xxx): seperate them and do not deps proto, eigen3
-  cc_library(math_arm SRCS  
-      funcs.cc 
+  cc_library(math_arm SRCS
+      funcs.cc
      packed_sgemm.cc
      packed_sgemm_c4.cc
      sgemm.cc
@@ -68,8 +68,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
      gemv_arm_int8.cc
      conv3x3s1_direct_fp32.cc
      conv3x3s2_direct_fp32.cc
-      conv3x3s1_depthwise_fp32.cc
-      conv3x3s2_depthwise_fp32.cc
+      conv3x3s1p01_depthwise_fp32.cc
+      conv3x3s2p01_depthwise_fp32.cc
+      conv3x3s1px_depthwise_fp32.cc
+      conv3x3s2px_depthwise_fp32.cc
      conv3x3s1_direct_int8.cc
      conv3x3s2_direct_int8.cc
      conv3x3s1_depthwise_int8.cc
@@ -77,16 +79,13 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
      conv5x5s1_depthwise_int8.cc
      conv5x5s1_depthwise_fp32.cc
      conv5x5s2_depthwise_fp32.cc
-      conv_depthwise_3x3p0.cc
-      conv_depthwise_3x3p1.cc
-      conv_depthwise_3x3s1.cc
-      conv_depthwise_3x3s2.cc
      conv_winograd_3x3.cc
      conv_impl.cc
-      softmax.cc 
+      softmax.cc
      scale.cc
      pooling.cc
      elementwise.cc
+      layout.cc
      lrn.cc
      decode_bboxes.cc
      concat.cc
@@ -122,4 +121,3 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
      anchor_generator.cc
      DEPS ${lite_kernel_deps} context tensor)
 endif()
- 
--- a/lite/backends/arm/math/conv_depthwise_3x3s1.cc
+++ b/lite/backends/arm/math/conv_depthwise_3x3s1.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/arm/math/conv_depthwise.h"
 #include <arm_neon.h>
+#include "lite/backends/arm/math/conv_depthwise.h"

 namespace paddle {
 namespace lite {

--- a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv_depthwise_3x3s2.cc
+++ b/lite/backends/arm/math/conv_depthwise_3x3s2.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "lite/backends/arm/math/conv_depthwise.h"
 #include <arm_neon.h>
+#include "lite/backends/arm/math/conv_depthwise.h"

 namespace paddle {
 namespace lite {

--- a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc
--- a/lite/backends/arm/math/conv_depthwise_3x3p0.cc
+++ b/lite/backends/arm/math/conv_depthwise_3x3p0.cc
--- a/lite/backends/arm/math/conv_depthwise_3x3p1.cc
+++ b/lite/backends/arm/math/conv_depthwise_3x3p1.cc
--- a/lite/backends/arm/math/conv_impl.cc
+++ b/lite/backends/arm/math/conv_impl.cc
@@ -361,7 +361,6 @@ void conv_im2col_gemm(const float* i_data,

  float* tmp_work_space =
      ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
-
  //! use gemv when the output channel size = 1
  for (int b = 0; b < num; ++b) {
    // dC

--- a/lite/backends/arm/math/funcs.h
+++ b/lite/backends/arm/math/funcs.h
@@ -39,6 +39,7 @@
 #include "lite/backends/arm/math/im2sequence.h"
 #include "lite/backends/arm/math/increment.h"
 #include "lite/backends/arm/math/interpolate.h"
+#include "lite/backends/arm/math/layout.h"
 #include "lite/backends/arm/math/lrn.h"
 #include "lite/backends/arm/math/negative.h"
 #include "lite/backends/arm/math/norm.h"

--- a/lite/backends/arm/math/layout.cc
+++ b/lite/backends/arm/math/layout.cc
--- a/lite/backends/arm/math/layout.h
+++ b/lite/backends/arm/math/layout.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+template <typename T>
+void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y);
+
+template <typename T>
+void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y);
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
--- a/lite/core/op_registry.h
+++ b/lite/core/op_registry.h
@@ -145,6 +145,12 @@ class KernelRegistry final {
              KernelRegistryForTarget<TARGET(kARM),
                                      PRECISION(kInt8),
                                      DATALAYOUT(kNCHW)> *,  //
+              KernelRegistryForTarget<TARGET(kARM),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNHWC)> *,  //
+              KernelRegistryForTarget<TARGET(kARM),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kNHWC)> *,  //

              KernelRegistryForTarget<TARGET(kOpenCL),
                                      PRECISION(kFloat),

--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -46,6 +46,7 @@ add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${li
 add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(layout_compute_arm ARM extra SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(reduce_mean_compute_arm ARM extra SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -101,7 +102,6 @@ lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS tran
 lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm)
 lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
 lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm)
-
 if(LITE_BUILD_EXTRA)
    lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm)
    lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm)

--- a/lite/kernels/arm/layout_compute.cc
+++ b/lite/kernels/arm/layout_compute.cc
--- a/lite/kernels/arm/layout_compute.h
+++ b/lite/kernels/arm/layout_compute.h
--- a/lite/tests/math/CMakeLists.txt
+++ b/lite/tests/math/CMakeLists.txt
@@ -8,4 +8,10 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
    lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(pool_compute_test SRCS pool_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
+
+    if(LITE_BUILD_EXTRA)
+        lite_cc_test(layout_compute_test SRCS layout_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
+    endif()
+    
+
 endif()
--- a/lite/tests/math/layout_compute_test.cc
+++ b/lite/tests/math/layout_compute_test.cc