Merge remote-tracking branch 'upstream/develop' into develop

1ac296e1 · dolphin8 · ba82924d · c9a81686 · 1ac296e1 · 1ac296e1
8 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,8 +17,10 @@ elseif(FPGA)
    add_definitions(-DPADDLE_MOBILE_FPGA)
 endif()
+# Only the language standard is applied to every configuration. Optimisation
+# and symbol stripping are added per build type below, so that a Debug build
+# keeps its symbols and is not compiled with -O3.
+set(CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}")
 if (DEBUGING)
    set(CMAKE_BUILD_TYPE Debug)
+    # Unoptimised build with debug info; CMAKE_CXX_FLAGS is still prepended
+    # by CMake, so it must not be repeated here.
+    set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
 else()
    set(CMAKE_BUILD_TYPE Release)
+    # Keep the existing Release flags and add the optimise/strip options
+    # that used to be passed in from tools/build.sh.
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -s")
 endif ()
@@ -55,7 +57,6 @@ else ()
    add_definitions(-DX86)
 endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
 set(CMAKE_VERBOSE_MAKEFILE ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
@@ -126,7 +127,7 @@ else ()
    add_definitions(-DCONV_OP)
    add_definitions(-DDEPTHWISECONV_OP)
    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DFUSIONCONVADD_OP)
+    add_definitions(-DFUSION_CONVADD_OP)
    add_definitions(-DCONVADDRELU_OP)
    add_definitions(-DFUSION_FC_OP)
    add_definitions(-DLRN_OP)

--- a/src/framework/op_registry.h
+++ b/src/framework/op_registry.h
@@ -96,74 +96,39 @@ class OpRegistry {
  }
 };
-#ifdef PADDLE_MOBILE_CPU
+#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type)     \
+  template <typename Dtype, typename T>                                    \
-#define REGISTER_OPERATOR_CPU(op_type, op_class)                               \
+  class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> {   \
-  template <typename Dtype, typename T>                                        \
+   public:                                                                 \
-  class _OpClass_##op_type##_cpu : public op_class<Dtype, T> {                 \
+    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class);   \
-   public:                                                                     \
+  };                                                                       \
-    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_cpu, op_class);                 \
+  static paddle_mobile::framework::OperatorRegistrar<                      \
-  };                                                                           \
+      device_type, _OpClass_##op_type##_##device_name<device_type, float>> \
-  static paddle_mobile::framework::OperatorRegistrar<                          \
+      __op_registrar_##op_type##_##device_name(#op_type);                  \
-      paddle_mobile::CPU, _OpClass_##op_type##_cpu<paddle_mobile::CPU, float>> \
+  int TouchOpRegistrar_##op_type##_##device_name() {                       \
-      __op_registrar_##op_type##__cpu(#op_type);                               \
+    __op_registrar_##op_type##_##device_name.Touch();                      \
-  int TouchOpRegistrar_##op_type##_cpu() {                                     \
+    return 0;                                                              \
-    __op_registrar_##op_type##__cpu.Touch();                                   \
-    return 0;                                                                  \
  }
-#define USE_OP_CPU(op_type)                                       \
+#define REGISTER_OPERATOR_CPU(op_type, op_class) \
-  extern int TouchOpRegistrar_##op_type##_cpu();                  \
+  REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU);
-  static int use_op_itself_##op_type##_ __attribute__((unused)) = \
-      TouchOpRegistrar_##op_type##_cpu()
-#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class)               \
-  template <typename Dtype, typename T>                             \
-  class _OpClass_##op_type##_mali_gpu : public op_class<Dtype, T> { \
-   public:                                                          \
-    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_mali_gpu, op_class); \
-  };                                                                \
-  static paddle_mobile::framework::OperatorRegistrar<               \
-      paddle_mobile::CPU,                                           \
-      _OpClass_##op_type##_mali_gpu<paddle_mobile::CPU, float>>     \
-      __op_registrar_##op_type##__mali_gpu(#op_type);               \
-  int TouchOpRegistrar_##op_type##_mali_gpu() {                     \
-    __op_registrar_##op_type##__mali_gpu.Touch();                   \
-    return 0;                                                       \
-  }
-#define USE_OP_MALI_GPU(op_type)                                  \
+#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class) \
-  extern int TouchOpRegistrar_##op_type##_mali_gpu();             \
+  REGISTER_OPERATOR(op_type, op_class, mali_gpu, paddle_mobile::GPU_MALI);
-  static int use_op_itself_##op_type##_ __attribute__((unused)) = \
-      TouchOpRegistrar_##op_type##_mali_gpu()
+#define REGISTER_OPERATOR_FPGA(op_type, op_class) \
+  REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
-#endif
+#define USE_OP(op_type, device_name)                                           \
-#ifdef PADDLE_MOBILE_FPGA
+  extern int TouchOpRegistrar_##op_type##_##device_name();                     \
-#define REGISTER_OPERATOR_FPGA(op_type, op_class)               \
+  static int use_op_itself_##op_type##_##device_name __attribute__((unused)) = \
-  template <typename Dtype, typename T>                         \
+      TouchOpRegistrar_##op_type##_##device_name()
-  class _OpClass_##op_type##_fpga : public op_class<Dtype, T> { \
-   public:                                                      \
+#define USE_OP_CPU(op_type) USE_OP(op_type, cpu);
-    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_fpga, op_class); \
-  };                                                            \
-  static paddle_mobile::framework::OperatorRegistrar<           \
-      paddle_mobile::CPU,                                       \
-      _OpClass_##op_type##_fpga<paddle_mobile::CPU, float>>     \
-      __op_registrar_##op_type##__fpga(#op_type);               \
-  int TouchOpRegistrar_##op_type##_fpga() {                     \
-    __op_registrar_##op_type##__fpga.Touch();                   \
-    return 0;                                                   \
-  }
-#define USE_OP_FPGA(op_type)                                      \
+#define USE_OP_MALI_GPU(op_type) USE_OP(op_type, mali_gpu);
-  extern int TouchOpRegistrar_##op_type##_fpga();                 \
-  static int use_op_itself_##op_type##_ __attribute__((unused)) = \
-      TouchOpRegistrar_##op_type##_fpga()
-#endif
+#define USE_OP_FPGA(op_type) USE_OP(op_type, fpga);
 }  // namespace framework
 }  // namespace paddle_mobile
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -131,7 +131,6 @@ class Tensor {
    }
    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor'snumel must >=0.")
    int64_t size = numel() * SizeOfType(type);
-    /* some versions of boost::variant don't have operator!= */
    if (holder_ == nullptr || holder_->size() < size + offset_) {
      holder_.reset(new PlaceholderImpl(size, type));
      offset_ = 0;

--- a/src/operators/fusion_conv_add.h
+++ b/src/operators/fusion_conv_add.h
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#define FUSION_CONVADD_OP
 #ifdef FUSION_CONVADD_OP
 #pragma once

--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -18,6 +18,27 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
+/**
+ * Expand a 1-D bias tensor, in place, to the shape `dDim`.
+ *
+ * After the call `bias` has dims `dDim`, and every element whose index along
+ * `axis` is `c` holds the original `bias[c]`. The flat indexing below assumes
+ * contiguous row-major storage.
+ *
+ * @param bias  1-D tensor on entry; resized to `dDim` and refilled on return.
+ * @param axis  dimension of `dDim` that the bias is aligned with.
+ *              NOTE(review): a negative axis is not handled - confirm callers
+ *              never pass one.
+ * @param dDim  target shape, e.g. the dims of the convolution output.
+ */
+void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
+  const DDim bias_ddim = bias.dims();
+  PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
+                        "the bias tensor's dims size != 1")
+  const int axis_size = dDim[axis];
+  PADDLE_MOBILE_ENFORCE(bias_ddim[0] >= axis_size,
+                        "the bias tensor is shorter than dDim[axis]")
+
+  // Copy the source values out before resizing: mutable_data() may allocate
+  // new storage for the larger shape, and a pointer taken beforehand is not
+  // guaranteed to stay valid afterwards.
+  const float *bias_src = bias.data<float>();
+  const std::vector<float> bias_values(bias_src, bias_src + axis_size);
+
+  DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
+  DDim inner_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
+  const int outer_size = paddle_mobile::framework::product(outer_ddim);
+  const int inner_size = paddle_mobile::framework::product(inner_ddim);
+
+  bias.Resize(dDim);
+  auto new_ptr = bias.mutable_data<float>();
+  for (int i = 0; i < outer_size; ++i) {
+    // `i` enumerates dims [0, axis] in row-major order, so the position along
+    // `axis` is i % axis_size. The previous i * axis_size / outer_size was
+    // only correct when all leading dims (e.g. the batch) had size 1.
+    const float v_bias = bias_values[i % axis_size];
+    for (int j = 0; j < inner_size; ++j) {
+      new_ptr[i * inner_size + j] = v_bias;
+    }
+  }
+}
 template <>
 void ConvAddKernel<CPU, float>::Compute(
    const FushionConvAddParam &param) const {
@@ -25,15 +46,16 @@ void ConvAddKernel<CPU, float>::Compute(
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
+  Tensor bias = *param.Bias();
+  int axis = param.Axis();
  Tensor *output = param.Output();
-  output->mutable_data<float>();
+  expand_bias(bias, axis, output->dims());
+  output->ShareDataWith(bias);
  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
-  //  DLOG << " compute end get Attrs " << strides[0];
  const int batch_size = static_cast<int>(input->dims()[0]);
  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
@@ -66,7 +88,6 @@ void ConvAddKernel<CPU, float>::Compute(
  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
-  DLOG << " filter.dims() = " << filter.dims();
  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};
@@ -105,7 +126,7 @@ void ConvAddKernel<CPU, float>::Compute(
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::matmul<float>(filter_slice, false, col_matrix, false,
                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(0));
+                          static_cast<float>(1));
    }
  }
 }

--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #pragma once
 #include <vector>
+#include "framework/ddim.h"
 #include "framework/operator.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
@@ -26,6 +27,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
+using framework::DDim;
 using framework::OpKernelBase;
 template <typename DeviceType, typename T>

--- a/tools/build.sh
+++ b/tools/build.sh
@@ -15,7 +15,6 @@ build_for_mac() {
    fi
    PLATFORM="x86"
    MODE="Release"
-    CXX_FLAGS="-std=c++11 -O3 -s"
    BUILD_DIR=../build/release/"${PLATFORM}"
    mkdir -p ${BUILD_DIR}/build
@@ -25,7 +24,6 @@ build_for_mac() {
    cmake .. \
        -B"${BUILD_DIR}" \
    	-DCMAKE_BUILD_TYPE="${MODE}" \
-    	-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
    	-DIS_MAC=true
    cd ${BUILD_DIR}
@@ -46,11 +44,11 @@ build_for_android() {
    if [ "${PLATFORM}" = "arm-v7a" ]; then
        ABI="armeabi-v7a with NEON"
        ARM_PLATFORM="V7"
-        CXX_FLAGS="-O3 -std=c++11 -s -march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
+        CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
    elif [ "${PLATFORM}" = "arm-v8a" ]; then
        ABI="arm64-v8a"
        ARM_PLATFORM="V8"
-        CXX_FLAGS="-O3 -std=c++11 -s -march=armv8-a  -pie -fPIE -w -Wno-error=format-security -llog"
+        CXX_FLAGS="-march=armv8-a  -pie -fPIE -w -Wno-error=format-security -llog"
    else
        echo "unknown platform!"
        exit -1

--- a/tools/scripts/push2android.sh
+++ b/tools/scripts/push2android.sh