diff --git a/CMakeLists.txt b/CMakeLists.txt index 08237cd850ae20c515a39c8783a18deaac431626..5739c2a26039426ab544f762e9401445f01e7de7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,9 @@ endif() if(ANDROID) if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") + elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + # TODO: support glog for Android api 16 ~ 19 in the future + message(WARNING "Using the unofficial git repository instead") endif() set(WITH_GPU OFF CACHE STRING diff --git a/Dockerfile.android b/Dockerfile.android index 452aa1574550627c2cce6375e12e154a9763254d..9d13a414f67be04e17b7d83403228d92bce0eda9 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -6,13 +6,14 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub # ENV variables ARG ANDROID_ABI +ARG ANDROID_API ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} +ENV ANDROID_API=${ANDROID_API:-21} ENV HOME=/root \ ANDROID_NDK_HOME=/opt/android-ndk-linux \ - ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \ - ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain + ANDROID_TOOLCHAINS_DIR=/opt/toolchains RUN apt-get update && \ apt-get install -y \ @@ -42,14 +43,12 @@ RUN pip install --upgrade pip && \ pip install pre-commit # Android NDK -RUN mkdir /opt/android-ndk-tmp && \ +RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \ + mkdir -p /opt/android-ndk-tmp && \ cd /opt/android-ndk-tmp && \ wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \ - rm -rf /opt/android-ndk-tmp && \ - rm -rf ${ANDROID_NDK_HOME} + rm -rf /opt/android-ndk-tmp CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"] diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 16e5bef4cdb8d6513de51838e3c3c8398dbad60d..01a2f4d5fa357ca882162247cc52299a3d1d3030 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -18,9 +18,9 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags) SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ELSE(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) @@ -56,3 +56,12 @@ SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) ADD_DEPENDENCIES(gflags extern_gflags) LIST(APPEND external_project_dependencies gflags) + +IF(WITH_C_API) + INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) + IF(ANDROID) + INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib) + ENDIF() +ENDIF() diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 8a594a825abdca6a0f989b94fa42f97d6df5e10a..b450a3016667dcb4ab229fe7ec8aaae8609d8171 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -19,9 +19,9 @@ SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) IF(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) ELSE(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) @@ -56,3 +56,12 @@ ADD_DEPENDENCIES(glog extern_glog gflags) LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) + +IF(WITH_C_API) + INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog) + IF(ANDROID) + INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib) + ENDIF() +ENDIF() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index f9e05af59fed7a8ad049390eda2c94d8577db1e7..4fc8d43fc10891603b79c01a1c769cae21c52655 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -73,6 +73,26 @@ IF(NOT ${CBLAS_FOUND}) UPDATE_COMMAND "" CONFIGURE_COMMAND "" ) + + IF(WITH_C_API) + INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) + # Because libopenblas.a is a symbolic link of another library, thus need to + # install the whole directory. + IF(ANDROID) + SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI}) + ELSE() + SET(TMP_INSTALL_DIR third_party/openblas/lib) + ENDIF() + INSTALL(CODE "execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib + destination ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} + )" + ) + INSTALL(CODE "MESSAGE(STATUS \"Installing: \" + \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" + )" + ) + ENDIF() ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index e629d61585c2d2ff916187ee28d4fd089a5bd857..a887be2e2ae5e21562fc15c775bb24cc1553480e 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -223,6 +223,15 @@ IF(NOT PROTOBUF_FOUND) SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) + IF(WITH_C_API) + INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) + IF(ANDROID) + INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib) + ENDIF() + ENDIF() + IF(CMAKE_CROSSCOMPILING) PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) ELSE() diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 45ca5542b7dc30216b45487782f849b93c5f8fca..5aecab90ca3cecdfdba0eac178a6ba07dfcb8745 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -49,3 +49,12 @@ ExternalProject_Add( ) LIST(APPEND external_project_dependencies zlib) + +IF(WITH_C_API) + INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) + IF(ANDROID) + INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib) + ENDIF() +ENDIF() diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index dde99ab3400be4e61bfe119fc272270518acf070..3af111eb5738c3f2f399ff4e5c06c8d2ecd8973e 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -64,9 +64,29 @@ link_paddle_exe(paddle_capi_shared) install(FILES ${CAPI_HEADERS} DESTINATION include/paddle) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle) if(ANDROID) + execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1 + OUTPUT_VARIABLE GIT_COMMITS_LIST + RESULT_VARIABLE GIT_COMMITS_LIST_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(${GIT_COMMITS_LIST_RESULT}) + set(GIT_COMMITS_LIST "No commits.") + endif() install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib/${ANDROID_ABI}) install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI}) + install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt + \"Compiler:\n\" + \"\\t${CMAKE_C_COMPILER}\\n\" + \"\\t${CMAKE_CXX_COMPILER}\\n\" + \"Compiler Flags:\\n\" + \"\\t${CMAKE_F_FLAGS}\\n\" + \"\\t${CMAKE_CXX_FLAGS}\\n\" + \"Android API: ${CMAKE_SYSTEM_VERSION}\\n\" + \"Lastest commit:\\n\" + \"\\t${GIT_COMMITS_LIST}\\n\" + )" + ) else(ANDROID) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib) install(TARGETS paddle_capi_shared DESTINATION lib) diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md index c762811dfc190b255e0a3389885a081ce8315caf..0a6d762bc8be5201ac196b4bc6107c06d07a31d7 100644 --- a/paddle/framework/backward.md +++ b/paddle/framework/backward.md @@ -2,11 +2,22 @@ ## Motivation -In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the gradient operators/expressions together with the chain rule. Every forward network needs a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass. +In Neural Network, many model is solved by the the backpropagation algorithm(known as BP) at present. Technically it caculates the gradient of the loss function, then distributed back through the networks. Follows the chain rule, so we need a module chains the gradient operators/expressions together with to construct the backward pass. Every forward network needs a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass. -## Backward Operator Registry +## Implementation -A backward network is built up with several backward operators. Backward operators take forward operators' inputs outputs, and output gradients and then calculate its input gradients. +In this design doc, we exported only one API for generating the backward pass. + +```c++ +std::unique_ptr Backward(const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars); +``` + +The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**. + +### Backward Operator Registry + +A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients. | | forward operator | backward operator | ---------------------- | ---------------- |------------------------- | @@ -25,7 +36,7 @@ REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad); `mul_grad` is the type of backward operator, and `MulOpGrad` is its class name. -## Backward Opeartor Creating +### Backward Opeartor Creating Given a certain forward operator, we can get its corresponding backward operator by calling: @@ -43,40 +54,47 @@ The function `BuildGradOp` will sequentially execute following processes: 4. Building backward operator with `inputs`, `outputs` and forward operator's attributes. -## Backward Network Building - -A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and put them together. +### Backward Network Building -In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network. - -given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`, `InputGradients`. +A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and append them together one by one. There is some corner case need to process specially. 1. Op - when the input forward network is an Op, return its gradient Operator Immediately. + When the input forward network is an Op, return its gradient Operator Immediately. If all of its outputs are in no gradient set, then return a special `NOP`. 2. NetOp - when the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp. + In our design, the network itself is also a kind of operator(**NetOp**). So the operators contained by a big network may be some small network. When the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp. + +3. RnnOp + + RnnOp is a nested stepnet operator. Backward module need to recusively call `Backward` for every stepnet. + +4. Sharing Variables + + **sharing variables**. As illustrated in the pictures, two operator's share the same variable name of W@GRAD, which will overwrite their sharing input variable. + +

+
- **shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwrite their shared input variable. +​ pic 1. Sharing variables in operators. -

-
+

- 1. Shared variable in operators. +​ Sharing variable between operators or same input variable used in multiple operators leads to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively and add a generic add operator to replace the overwrite links. -

+

+
- Share variable between operators or same input variable used in multiple operators leads to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively and add a generic add operator replace the overwrite links. +​ pic 2. Replace sharing variable's gradient with `Add` operator. -

-
+

- 2. Replace shared variable's gradient with `Add` operator. +​ Because our framework finds variables accord to their names, we need to rename the output links. We add a suffix of number to represent its position in clockwise. -

+5. Part of Gradient is Zero. + In the whole graph, there is some case of that one operator's gradient is not needed, but its input's gradient is a dependency link of other operator, we need to fill a same shape gradient matrix in the position. In our implement, we insert a special `fillZeroLike` operator. -​ Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it. +Follow these rules above, then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it. diff --git a/paddle/framework/images/duplicate_op2.graffle b/paddle/framework/images/duplicate_op2.graffle index ede3bca30ae17d5af52505fd94dc2f79b23b57e0..5cec3bc64dbd44dc99e348485969f29bd128ceb1 100644 Binary files a/paddle/framework/images/duplicate_op2.graffle and b/paddle/framework/images/duplicate_op2.graffle differ diff --git a/paddle/framework/images/duplicate_op2.png b/paddle/framework/images/duplicate_op2.png index 4e872dc2caf3b0cbd0d5176f11a14801b538dc86..21cdd5cabf1b5203e1435a75b57770d2f702fa92 100644 Binary files a/paddle/framework/images/duplicate_op2.png and b/paddle/framework/images/duplicate_op2.png differ diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/function/neon/NeonDepthwiseConv.h index aefeea78badbca3d0d09e292e4e1e148618f8ac6..33722d3cac61b62f5dce8f51105c1bf4e70c4a6c 100644 --- a/paddle/function/neon/NeonDepthwiseConv.h +++ b/paddle/function/neon/NeonDepthwiseConv.h @@ -594,7 +594,7 @@ struct StridePadding { float32x4_t s1 = vdupq_n_f32(0.f); for (int s = 0; s < step; s++) { float32x4_t s0 = vld1q_f32(input); - float32x4x2_t v = {s0, s1}; + float32x4x2_t v = {{s0, s1}}; vst2q_f32(inputPadding, v); input += 4; inputPadding += 8; diff --git a/paddle/gserver/layers/SwitchOrderLayer.cpp b/paddle/gserver/layers/SwitchOrderLayer.cpp index d7eee6eaf078dab8d48adc4c7ee758a433672ac6..e97809141a93106f9e6ebaf40c7e8aa9c6010557 100644 --- a/paddle/gserver/layers/SwitchOrderLayer.cpp +++ b/paddle/gserver/layers/SwitchOrderLayer.cpp @@ -83,8 +83,7 @@ void SwitchOrderLayer::forward(PassType passType) { setOutDims(); resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]); if (heightAxis_.size() > 0) { - getOutputValue()->reshape(reshapeHeight_, reshapeWidth_); - getOutputGrad()->reshape(reshapeHeight_, reshapeWidth_); + resetOutput(reshapeHeight_, reshapeWidth_); } // switch NCHW to NHWC diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1742925545d29df5d7df719faaea3b754680ab61 --- /dev/null +++ b/paddle/operators/elementwise_mul_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/elementwise_mul_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class ElementWiseMulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null"); + auto x_dim = ctx.Input("X")->dims(); + auto y_dim = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input.") + ctx.Output("Out")->Resize(x_dim); + } +}; + +class ElementWiseMulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ElementWiseMulOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of elementwise mul op"); + AddInput("Y", "The second input of elementwise mul op"); + AddAttr("axis", + R"DOC( +When shape(Y) does not equal shape(X),Y will be broadcasted +to match the shape of X and axis should be dimension index Y in X + )DOC") + .SetDefault(-1) + .EqualGreaterThan(-1); + + AddOutput("Out", "The output of elementwise mul op"); + AddComment(R"DOC( +Limited elementwise multiple operator.The equation is: Out = X ⊙ Y. +1. The shape of Y should be same with X or +2. Y's shape is a subset of X. + Y will be broadcasted to match the shape of X and axis should be dimension index Y in X. + example: + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 +)DOC"); + } +}; + +class ElementWiseMulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx.Input("X")->dims(); + auto y_dims = ctx.Input("Y")->dims(); + auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input.") + + if (x_grad) { + x_grad->Resize(x_dims); + } + + if (y_grad) { + y_grad->Resize(y_dims); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_mul, ops::ElementWiseMulOp, ops::ElementWiseMulOpMaker, + elementwise_mul_grad, ops::ElementWiseMulOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_mul, + ops::ElementWiseMulKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_mul_grad, + ops::ElementWiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..56f2087c22c6c599a3c5aef36eb0fe3eac295bef --- /dev/null +++ b/paddle/operators/elementwise_mul_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/elementwise_mul_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL( + elementwise_mul, + ops::ElementWiseMulKernel); +REGISTER_OP_GPU_KERNEL( + elementwise_mul_grad, + ops::ElementWiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e9ed6791799240039f9af42c1a4339be7126ee65 --- /dev/null +++ b/paddle/operators/elementwise_mul_op.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { +/* + * Out = X ⊙ Y + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + */ + +inline void get_mid_dims(const framework::DDim& x_dims, + const framework::DDim& y_dims, const int axis, + int& pre, int& n, int& post) { + pre = 1; + n = 1; + post = 1; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + + for (int i = 0; i < y_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], + "Broadcast dimension mismatch."); + n *= y_dims[i]; + } + + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + post *= x_dims[i]; + } +} + +template +class ElementWiseMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto z_e = framework::EigenVector::Flatten(*z); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input.") + + if (x_dims == y_dims || product(y_dims) == 1) { + z_e.device(ctx.GetEigenDevice()) = x_e * y_e; + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + z_e.device(ctx.GetEigenDevice()) = x_e * y_bcast; + return; + } else { + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + z_e.device(ctx.GetEigenDevice()) = x_e * y_bcast; + return; + } + } +}; + +template +class ElementWiseMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dout_e = framework::EigenVector::Flatten(*dout); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + } + + if (x_dims == y_dims || product(y_dims) == 1) { + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(ctx.GetEigenDevice()) = dout_e * y_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(ctx.GetEigenDevice()) = x_e * dout_e; + } + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + + // TODO(gongweibao): wrap reshape to a function. + if (post == 1) { + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(ctx.GetEigenDevice()) = dout_e * y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(ctx.GetEigenDevice()) = + (x_e * dout_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + return; + } else { + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(ctx.GetEigenDevice()) = dout_e * y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(ctx.GetEigenDevice()) = + (x_e * dout_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + return; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e78b6ec133981494a65b5e16316ae8fdbd61a60 --- /dev/null +++ b/paddle/operators/pad_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/pad_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class PadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto x_dim = ctx.Input("X")->dims(); + auto paddings = Attr>("paddings"); + PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()), + "Size of paddings should be equal to 2 * dimension size " + "of input tensor."); + std::vector out_dims(x_dim.size()); + for (int i = 0; i < x_dim.size(); ++i) { + out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; + } + ctx.Output("Out")->Resize(framework::make_ddim(out_dims)); + } +}; + +class PadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PadOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input of pad op. " + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddOutput("Out", + "The output of pad op." + "A tensor with the same shape as X.") + .NotInGradient(); + AddComment(R"DOC( +Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example: + +Given: + +X = [[1, 2], + [3, 4]] + +and + +paddings = [0, 1, 1, 2] + +and + +pad_value = 0 + +then we get + +Out = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]] +)DOC"); + AddAttr>( + "paddings", + "A list to describes padding rules for each dimension." + " For 2-D image tensor, paddings=[0, 1, 2, 3] means" + " padding 0 row to top, 1 row to bottom, 2 columns to left" + " and 3 columns to right.Size of paddings should be equal to" + " 2 * dimension size of input tensor."); + AddAttr("pad_value", + "(float) default to 0; " + "The value to fill padded areas.") + .SetDefault(0.0f); + } +}; + +class PadOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx.Input("X")->dims(); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + if (x_grad != nullptr) { + x_grad->Resize(x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(pad, ops::PadOp, ops::PadOpMaker, pad_grad, ops::PadOpGrad); +REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel); +REGISTER_OP_CPU_KERNEL(pad_grad, + ops::PadGradKernel); diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..555a7dba23c6fa2659cabf4858b42ff70d74bf18 --- /dev/null +++ b/paddle/operators/pad_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/pad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(pad, ops::PadKernel); +REGISTER_OP_GPU_KERNEL(pad_grad, + ops::PadGradKernel); diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2cc3b945ae5b2e2e93d8531c7f99e4c215d1d806 --- /dev/null +++ b/paddle/operators/pad_op.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenTensor = framework::EigenTensor; + +template +void PadFunction(const framework::ExecutionContext& context) { + auto pads = context.Attr>("paddings"); + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = pads[i * 2]; + paddings[i].second = pads[i * 2 + 1]; + } + T pad_value = context.Attr("pad_value"); + + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + auto x_tensor = EigenTensor::From(*x); + auto out_tensor = EigenTensor::From(*out); + auto place = context.GetEigenDevice(); + out_tensor.device(place) = x_tensor.pad(paddings, pad_value); +} + +template +class PadKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + PadFunction(context); + break; + case 2: + PadFunction(context); + break; + case 3: + PadFunction(context); + break; + case 4: + PadFunction(context); + break; + case 5: + PadFunction(context); + break; + case 6: + PadFunction(context); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } + } +}; + +template +void PadGradFunction(const framework::ExecutionContext& context) { + auto pads = context.Attr>("paddings"); + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = -pads[i * 2]; + paddings[i].second = -pads[i * 2 + 1]; + } + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + d_x->mutable_data(context.GetPlace()); + auto d_x_tensor = EigenTensor::From(*d_x); + auto d_out_tensor = EigenTensor::From(*d_out); + auto place = context.GetEigenDevice(); + d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); + } +} + +template +class PadGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + size_t rank = + context.Input(framework::GradVarName("Out"))->dims().size(); + switch (rank) { + case 1: + PadGradFunction(context); + break; + case 2: + PadGradFunction(context); + break; + case 3: + PadGradFunction(context); + break; + case 4: + PadGradFunction(context); + break; + case 5: + PadGradFunction(context); + break; + case 6: + PadGradFunction(context); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index e61aa3a2a526409c6808c04072449240f3dc4455..96c4e88845b085966feca0415f1db0d398018650 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -35,6 +35,7 @@ USE_OP(add); USE_OP(onehot_cross_entropy); USE_OP(sgd); USE_OP(mul); +USE_OP(elementwise_mul); USE_OP(mean); USE_OP(sigmoid); USE_OP(softmax); @@ -49,6 +50,7 @@ USE_NO_KERNEL_OP(identity); USE_OP(minus); USE_OP(cos_sim); USE_CPU_ONLY_OP(gather); +USE_OP(pad); USE_CPU_ONLY_OP(scatter); USE_CPU_ONLY_OP(concat); USE_OP(top_k); diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index aabd2da5e499c8e648f2967e56c661ec37f025a1..11612ad4bed0afa8496087605afaefbd0420d5ce 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,8 +2,30 @@ set -xe +if [ $ANDROID_ABI == "arm64-v8a" ]; then + ANDROID_ARCH=arm64 +else # armeabi, armeabi-v7a + ANDROID_ARCH=arm +fi + +ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API + +cat </dev/null || true mkdir -p $BUILD_ROOT @@ -11,7 +33,7 @@ cd $BUILD_ROOT if [ $ANDROID_ABI == "armeabi-v7a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ - -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_NEON=ON \ -DANDROID_ARM_MODE=ON \ @@ -26,7 +48,7 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then .. elif [ $ANDROID_ABI == "arm64-v8a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ - -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_MODE=ON \ -DHOST_C_COMPILER=/usr/bin/gcc \ @@ -40,12 +62,12 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then .. elif [ $ANDROID_ABI == "armeabi" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ - -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_MODE=ON \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ - -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ @@ -55,5 +77,10 @@ else echo "Invalid ANDROID_ABI: $ANDROID_ABI" fi +cat <