diff --git a/.travis.yml b/.travis.yml index b4b83fcdbc84ce0fb0c91c816ebc3c964acfa590..e217c8f5a740ef5ab7315656ed7839ffa219c805 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,6 @@ cache: - $HOME/.ccache - $HOME/.cache/pip - $TRAVIS_BUILD_DIR/build/third_party - - $TRAVIS_BUILD_DIR/build_android/third_party sudo: required dist: trusty os: @@ -12,7 +11,6 @@ os: env: - JOB=build_doc - JOB=check_style - - JOB=build_android addons: apt: packages: @@ -23,7 +21,6 @@ addons: - python - python-pip - python2.7-dev - - python-numpy - python-wheel - libboost-dev - curl @@ -37,8 +34,8 @@ before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - - pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt - - pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker + - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt + - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker - curl https://glide.sh/get | bash - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - go get -u github.com/alecthomas/gometalinter diff --git a/CMakeLists.txt b/CMakeLists.txt index ad559672ad2f83a3d62cdf332b47c6cf1e730f70..08237cd850ae20c515a39c8783a18deaac431626 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,8 +65,8 @@ if(NOT CMAKE_BUILD_TYPE) endif() if(ANDROID) - if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21") + if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") + message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") endif() set(WITH_GPU OFF CACHE STRING diff --git a/Dockerfile.android b/Dockerfile.android index c0fa58c384f9ebcae60477ffce49ea4ffa929db9..452aa1574550627c2cce6375e12e154a9763254d 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors ARG UBUNTU_MIRROR RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' +# ENV variables +ARG ANDROID_ABI + +ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} + ENV HOME=/root \ ANDROID_NDK_HOME=/opt/android-ndk-linux \ - ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc + ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \ + ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain RUN apt-get update && \ apt-get install -y \ @@ -15,12 +21,11 @@ RUN apt-get update && \ apt-get clean -y # Install Go and glide -RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ - tar -C /usr/local -xzf go.tgz && \ +RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ mkdir /root/gopath && \ mkdir /root/gopath/bin && \ - mkdir /root/gopath/src && \ - rm go.tgz + mkdir /root/gopath/src ENV GOROOT=/usr/local/go GOPATH=/root/gopath # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin @@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \ wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \ rm -rf /opt/android-ndk-tmp && \ rm -rf ${ANDROID_NDK_HOME} diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 5e3e437a8da9624df35a5c754fe77be73f20361d..84219cfa5587f5b765b2e8f35180797d7053169f 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -20,6 +20,7 @@ # The supported variables are listed belows: # # ANDROID_STANDALONE_TOOLCHAIN +# ANDROID_TOOLCHAIN # ANDROID_ABI # ANDROID_NATIVE_API_LEVEL # ANDROID_ARM_MODE @@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL) ENDIF() ENDIF() +IF(NOT DEFINED ANDROID_TOOLCHAIN) + SET(ANDROID_TOOLCHAIN clang) +ENDIF() + IF(NOT DEFINED ANDROID_ABI) SET(ANDROID_ABI "armeabi-v7a") ENDIF() @@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") "${CMAKE_VERSION}), when cross-compiling for Android.") IF(ANDROID_STANDALONE_TOOLCHAIN) + # Use standalone toolchain SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") IF(NOT CMAKE_SYSTEM_VERSION) @@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() # Toolchain - SET(ANDROID_TOOLCHAIN "gcc") SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) - IF(ANDROID_ABI STREQUAL "armeabi") - SET(CMAKE_SYSTEM_PROCESSOR armv5te) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_SYSTEM_PROCESSOR armv7-a) - ENDIF() - ENDIF() - IF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) - SET(CMAKE_SYSTEM_PROCESSOR aarch64) + ELSE(ANDROID_NDK) + # TODO: use android ndk + ENDIF() + + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) + IF(ANDROID_ABI STREQUAL "armeabi") + SET(CMAKE_SYSTEM_PROCESSOR armv5te) + SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi) + ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_SYSTEM_PROCESSOR armv7-a) + SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi) ENDIF() - SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + SET(CMAKE_SYSTEM_PROCESSOR aarch64) + SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android) + ELSE() + MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.") + ENDIF() + SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") + + IF(ANDROID_TOOLCHAIN STREQUAL clang) + SET(ANDROID_C_COMPILER_NAME clang) + SET(ANDROID_CXX_COMPILER_NAME clang++) + SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) + SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) + ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc) + SET(ANDROID_C_COMPILER_NAME gcc) + SET(ANDROID_CXX_COMPILER_NAME g++) + ELSE() + MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}") ENDIF() # C compiler IF(NOT CMAKE_C_COMPILER) - SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc") + SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}") ELSE() GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) ENDIF() @@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") # CXX compiler IF(NOT CMAKE_CXX_COMPILER) - SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++") + SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}") ELSE() GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) ENDIF() @@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) # Toolchain and ABI specific flags. - SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64") + SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections") SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") IF(ANDROID_ABI STREQUAL "armeabi") @@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") -march=armv5te -mtune=xscale -msoft-float) - ENDIF() - IF(ANDROID_ABI STREQUAL "armeabi-v7a") + ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv7-a -mfloat-abi=softfp) @@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) ENDIF() LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) ENDIF() IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") @@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ELSE() LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) ENDIF() + IF(ANDROID_TOOLCHAIN STREQUAL clang) + # Disable integrated-as for better compatibility. + LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as) + ENDIF() ENDIF() - IF(ANDROID_ABI STREQUAL "arm64-v8a") - LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) + IF(ANDROID_TOOLCHAIN STREQUAL clang) + # CMake automatically forwards all compiler flags to the linker, + # and clang doesn't like having -Wa flags being used for linking. + # To prevent CMake from doing this would require meddling with + # the CMAKE__COMPILE_OBJECT rules, which would get quite messy. + LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments) ENDIF() STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 0002a470d90f722e3f9106ca56d70e6bf2cea339..f9e05af59fed7a8ad049390eda2c94d8577db1e7 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +IF(USE_EIGEN_FOR_BLAS) + return() +ENDIF(USE_EIGEN_FOR_BLAS) + INCLUDE(cblas) IF(NOT ${CBLAS_FOUND}) diff --git a/doc/design/functions_operators_layers.md b/doc/design/functions_operators_layers.md index 7a2e8fd0ace2e3f4462b15215de22c31e944b7cb..d23ba56b5773a36d448a99e4abdebc1475ed789c 100644 --- a/doc/design/functions_operators_layers.md +++ b/doc/design/functions_operators_layers.md @@ -86,12 +86,13 @@ def layer.fc(X): We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in above illustrative example: -``` + | C++ functions/functors | mul | add | | | +|------------------------|--------------|--------------|-------------|----------| | C++ operator class | mulOp | addOp | FCOp | | | Python binding | operator.mul | operator.add | operator.fc | | | Python function | | | | layer.fc | -``` + This is how we differentiate layer and operators in PaddlePaddle: diff --git a/doc/design/ops/dist_train.md b/doc/design/ops/dist_train.md new file mode 100644 index 0000000000000000000000000000000000000000..fa3c5d7990213cf2b0d236e66e592dd2699da876 --- /dev/null +++ b/doc/design/ops/dist_train.md @@ -0,0 +1,106 @@ +# Design Doc: Operation Graph Based Parameter Server + +## Abstract + +We propose an approach to implement the parameter server. In this +approach, there is no fundamental difference between the trainer and +the parameter server: they both run subgraphs, but subgraphs of +different purposes. + +## Background + +The previous implementations of the parameter server does not run a +subgraph. parameter initialization, optimizer computation, network +communication and checkpointing are implemented twice on both the +trainer and the parameter server. + +It would be great if we can write code once and use them on both the +trainer and the parameter server: reduces code duplication and +improves extensibility. Given that after the current refactor, we are +representing everything as a computing graph on the +trainer. Representing everything as a computing graph on the parameter +server becomes a natural extension. + +## Design + +### Graph Converter + +The *graph converter* converts the user-defined operation (OP) graph +into subgraphs to be scheduled on different nodes with the following +steps: + +1. OP placement: the OPs will be placed on different nodes according + to heuristic that minimizes estimated total computation + time. Currently we will use a simple heuristic that puts parameter + varable on parameter server workers and everything else on trainer + workers. + +1. Add communication OPs to enable the communication between nodes. + +We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*. + +Below is an example of converting the user defined graph to the +subgraphs for the trainer and the parameter server: + + + +After converting: + + + +1. The parameter variable W and it's optimizer subgraph are placed on the parameter server. +1. Operators are added to the subgraphs. + - *Send* sends data to the connected *Recv* operator. The + scheduler on the receive node will only schedule *Recv* operator + to run when the *Send* operator has ran (the *Send* OP will mark + the *Recv* OP runnable automatically). + - *Enueue* enqueues the input variable, it can block until space + become available in the queue. + - *Dequeue* outputs configurable numbers of tensors from the + queue. It will block until the queue have the required number of + tensors. + + +### Benefits + +- Model parallelism become easier to implement: it's an extension to + the trainer - parameter server approach. we already have the + communication OPs, but need to extend the graph converter's + placement functionality. + +- User-defined optimizer is easier to add - user can now express it as + a subgraph. + +- No more duplication logic inside the trainer and the parameter + server mentioned in the background section. + +### Challenges + +- It might be hard for the graph converter to cut a general graph + (without any hint for which subgraph is the optimizer). We may need + to label which subgraph inside the OP graph is the optimizer. + +- It's important to balance the parameter shards of on multiple + parameter server. If a single parameter is very big (some + word-embedding, fully connected, softmax layer), we need to + automatically partition the single parameter onto different + parameter servers when possible (only element-wise optimizer depends + on the parameter variable). + +### Discussion + +- In the "Aync SGD" figure, the "W" variable on the parameter server + could be read and wrote concurrently, what is our locking strategy? + E.g., each variable have a lock cpp method to be invoked by every + OP, or, have a lock OP. + +- Can the Enqueue OP be implemented under our current tensor design + (puts the input tensor into the queue tensor)? + +- *Dequeue* OP will have variable numbers of output (depends on the + `min_count` attribute), does our current design support it? (similar + question for the *Add* OP) + + +### References: +[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) diff --git a/doc/design/ops/src/dist-graph.graffle b/doc/design/ops/src/dist-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b Binary files /dev/null and b/doc/design/ops/src/dist-graph.graffle differ diff --git a/doc/design/ops/src/dist-graph.png b/doc/design/ops/src/dist-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7 Binary files /dev/null and b/doc/design/ops/src/dist-graph.png differ diff --git a/doc/design/ops/src/local-graph.graffle b/doc/design/ops/src/local-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5 Binary files /dev/null and b/doc/design/ops/src/local-graph.graffle differ diff --git a/doc/design/ops/src/local-graph.png b/doc/design/ops/src/local-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7 Binary files /dev/null and b/doc/design/ops/src/local-graph.png differ diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md new file mode 100644 index 0000000000000000000000000000000000000000..86a95c10d5729704f86c285c9fe92db0cf2158be --- /dev/null +++ b/doc/design/var_desc.md @@ -0,0 +1,124 @@ +## Background +PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime. + +PaddlePaddle use proto message to describe compile time graph for + +1. Computation graph should be able to be saved to a file. +1. In distributed training, the graph will be serialized and send to multiple workers. + +The computation graph is constructed by Data Node and Operation Node. The concept to represent them is in the table below. + +| |compile time|runtime| +|---|---|---| +|Data|VarDesc(proto)|Variable(cpp)| +|Operation|OpDesc(proto)|Operator(cpp)| + + +## Definition of VarDesc + +A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it. + +```proto +message VarDesc { + required string name = 1; + optional LoDTensorDesc lod_tensor = 2; +} +``` + +## Definition of LodTensorDesc + +```proto +enum DataType { + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; +} + +message LoDTensorDesc { + required DataType data_type = 1; + repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + optional int32 lod_level = 3 [default=0]; +} +``` + +## Definition of Variable in Python + +In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable. + +```python +image = Variable(dims=[-1, 640, 480]) +# fc1 and fc2 are both Variable +fc1 = layer.fc(input=image, output_size=10) +fc2 = layer.fc(input=fc1, output_size=20) +``` +### what should class `Variable` Have +1. `name`.a name of string type is used to mark the value of the Variable. +1. `initializer`. Since our Tensor does not have value. we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator. +1. `operator`. Variable should record which operator produce itself. The reaon is: + - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable. + +In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph. + +```python +import VarDesc +import LoDTensorDesc +import framework + +def AddInitialOperator(variable, initializer): + # add an initialize Operator to block to init this Variable + +class Variable(object): + def __init__(self, name, dims, type, initializer): + self._block = get_default_block() + self._name = name + self.op = None + + tensor_desc = LoDTensorDesc(data_type=type, dims=dims) + _var_desc = VarDesc(name=name, lod_tensor=tensor_desc) + self._var = framework.CreateVar(_var_desc) + self._block.add_var(self) + + # add initial op according to initializer + if initializer is not None: + AddInitialOperator(self, initializer) + + def dims(self): + return self._var.dims() + + def data_type(self): + return self._var.data_type() + + def to_proto(self): + pass +``` + +Then we can use this Variable to create a fc layer in Python. + +```python +import paddle as pd + +def flatten_size(X, num_flatten_dims): + prod = 1 # of last num_flatten_dims + for i in xrange(num_flatten_dims): + prod = prod * X.dims[-i-1] + return prod + +def layer.fc(X, output_size, num_flatten_dims): + W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size]) + b = Variable(pd.random_uniform(), type=FP32, dims=[output_size]) + out = Variable(type=FP32) + y = operator.fc(X, W, b, output=out) # fc will put fc op input into out + pd.InferShape(y) + return out + +x = Variable(dims=[-1, 640, 480]) +y = layer.fc(x, output_size=100) +z = layer.fc(y, output_size=200) + +paddle.eval(targets=[z], ...) +print(z) +``` diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh index c0a37ced2a72a1ab410025e2aa45313c23f1349a..e4f6bf42c61694e9826a127c9628730cfd43ada7 100644 --- a/paddle/cuda/include/hl_cpu_gru.cuh +++ b/paddle/cuda/include/hl_cpu_gru.cuh @@ -18,14 +18,6 @@ limitations under the License. */ #ifndef __NVCC__ -#include "paddle/math/MathFunctions.h" - -#ifndef PADDLE_TYPE_DOUBLE -#define CBLAS_GEMM paddle::gemm -#else -#define CBLAS_GEMM paddle::gemm -#endif - template void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, real *gateValue, @@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput, } } -template -void hl_cpu_gru_forward(OpResetOutput opResetOutput, - OpFinalOutput opFinalOutput, - hl_gru_value value, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate) { - if (value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasNoTrans, - batchSize, - 2 * frameSize, - frameSize, - 1, - value.prevOutValue, - frameSize, - value.gateWeight, - frameSize * 2, - 1, - value.gateValue, - frameSize * 3); - } - - forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate); - - if (value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasNoTrans, - batchSize, - frameSize, - frameSize, - 1, - value.resetOutputValue, - frameSize, - value.stateWeight, - frameSize, - 1, - value.gateValue + frameSize * 2, - frameSize * 3); - } - - forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node); -} - template void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, real *gateValue, @@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad, } } -template -void hl_cpu_gru_backward(OpStateGrad opStateGrad, - OpResetGrad opResetGrad, - hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate) { - backward_state_grad(opStateGrad, value, grad, - frameSize, batchSize, active_node); - - if (value.prevOutValue && grad.prevOutGrad) { - CBLAS_GEMM(CblasNoTrans, - CblasTrans, - batchSize, - frameSize, - frameSize, - 1, - grad.gateGrad + frameSize * 2, - frameSize * 3, - value.stateWeight, - frameSize, - 0, - grad.resetOutputGrad, - frameSize); - - if (grad.stateWeightGrad) { - CBLAS_GEMM(CblasTrans, - CblasNoTrans, - frameSize, - frameSize, - batchSize, - 1, - value.resetOutputValue, - frameSize, - grad.gateGrad + frameSize * 2, - frameSize * 3, - 1, - grad.stateWeightGrad, - frameSize); - } - } - - backward_reset_grad(opResetGrad, value, grad, - frameSize, batchSize, active_gate); - - if (grad.prevOutGrad && value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasTrans, - batchSize, - frameSize, - frameSize * 2, - 1, - grad.gateGrad, - frameSize * 3, - value.gateWeight, - frameSize * 2, - 1, - grad.prevOutGrad, - frameSize); - - if (grad.gateWeightGrad) { - CBLAS_GEMM(CblasTrans, - CblasNoTrans, - frameSize, - frameSize * 2, - batchSize, - 1, - value.prevOutValue, - frameSize, - grad.gateGrad, - frameSize * 3, - 1, - grad.gateWeightGrad, - frameSize * 2); - } - } -} - #endif #endif // HL_CPU_GRU_CUH_ diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h index cde3dfa1d3d19b1bee9fd23dad52ecbbe628c3a9..2b788a76cafe198abb9aed8ba842e37cc6ff73a6 100644 --- a/paddle/framework/attribute.h +++ b/paddle/framework/attribute.h @@ -45,7 +45,19 @@ class GreaterThanChecker { public: explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} void operator()(T& value) const { - PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail"); + PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); + } + + private: + T lower_bound_; +}; + +template +class EqualGreaterThanChecker { + public: + explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + void operator()(T& value) const { + PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); } private: @@ -115,6 +127,11 @@ class TypedAttrChecker { return *this; } + TypedAttrChecker& EqualGreaterThan(const T& lower_bound) { + value_checkers_.push_back(EqualGreaterThanChecker(lower_bound)); + return *this; + } + // we can add more common limits, like LessThan(), Between()... TypedAttrChecker& SetDefault(const T& default_value) { diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md index 8aa6728a95bc464ab8884986f0cec6c817d3303b..c762811dfc190b255e0a3389885a081ce8315caf 100644 --- a/paddle/framework/backward.md +++ b/paddle/framework/backward.md @@ -2,20 +2,20 @@ ## Motivation -In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass. - +In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the gradient operators/expressions together with the chain rule. Every forward network needs a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass. + ## Backward Operator Registry -A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients. +A backward network is built up with several backward operators. Backward operators take forward operators' inputs outputs, and output gradients and then calculate its input gradients. | | forward operator | backward operator | ---------------------- | ---------------- |------------------------- | | **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients | | **Operator::outputs_** | Outputs | InputGradients | - In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced. + In most cases, there is a one-to-one correspondence between the forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced. -For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro: +For example, we have got a `mul_op`, and we can register its information and corresponding backward operator by the following macro: ```cpp REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad); @@ -27,17 +27,17 @@ REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad); ## Backward Opeartor Creating -Given a certain forward operator, we can get its corresponding backward opeartor by calling: +Given a certain forward operator, we can get its corresponding backward operator by calling: ```cpp OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op); -``` +``` The function `BuildGradOp` will sequentially execute following processes: 1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`. -2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these are not necessary for gradient computing. +2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these, are not necessary for gradient computing. 3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`. @@ -49,31 +49,31 @@ A backward network is a series of backward operators. The main idea of building In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network. -given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`. +given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`, `InputGradients`. 1. Op - when the input forward network is a Op, return its gradient Operator Immediately. + when the input forward network is an Op, return its gradient Operator Immediately. 2. NetOp - when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to forward NetOp. + when the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp. - **shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwirte their shared input variable. + **shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwrite their shared input variable.

-
+
- 1. shared variable in two operators. + 1. Shared variable in operators.

- Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator replace the overwirte links. + Share variable between operators or same input variable used in multiple operators leads to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively and add a generic add operator replace the overwrite links.

-
+
- 2. replace shared variable gradient with `Add` Operator + 2. Replace shared variable's gradient with `Add` operator.

diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 85b7de79743bb0390d66b8999f2e8342a51d14a9..fc3d508553c0e966978b28d58127bdbff10d45f1 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -283,5 +283,14 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { DDim::DDim(std::initializer_list init_list) { *this = make_ddim(init_list); } + +DDim flatten_to_2d(const DDim& src, int num_col_dims) { + int rank = src.size(); + return make_ddim({product(slice_ddim(src, 0, num_col_dims)), + product(slice_ddim(src, num_col_dims, rank))}); +} + +DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } + } // namespace framework } // namespace paddle diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index db30c523948b1d437615aa0e9bfecb5e25569296..ca29e7e8c7776de6adf3e3b0e8f11f0d4d8487c3 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -115,6 +115,12 @@ int arity(const DDim& ddim); std::ostream& operator<<(std::ostream&, const DDim&); +// Reshape a tensor to a matrix. The matrix's first dimension(column length) +// will be the product of tensor's first `num_col_dims` dimensions. +DDim flatten_to_2d(const DDim& src, int num_col_dims); + +DDim flatten_to_1d(const DDim& src); + } // namespace framework } // namespace paddle diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h index 2d8d9ae10c56e0632414a5bbc754d35bfa9ce6a5..54bbeafcabdeeb1e2c1017c156b3512c83dada3a 100644 --- a/paddle/framework/eigen.h +++ b/paddle/framework/eigen.h @@ -63,20 +63,35 @@ struct EigenTensor { template -struct EigenMatrix : public EigenTensor {}; +struct EigenMatrix : public EigenTensor { + static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } + + static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, + int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } +}; template struct EigenVector : public EigenTensor { // Flatten reshapes a Tensor into an EigenVector. static typename EigenVector::Type Flatten(Tensor& tensor) { - return EigenVector::From( - tensor, make_ddim({static_cast(product(tensor.dims_))})); + return EigenVector::From(tensor, {product(tensor.dims_)}); } static typename EigenVector::ConstType Flatten(const Tensor& tensor) { - return EigenVector::From( - tensor, make_ddim({static_cast(product(tensor.dims_))})); + return EigenVector::From(tensor, {product(tensor.dims_)}); } }; diff --git a/paddle/framework/eigen_test.cc b/paddle/framework/eigen_test.cc index dc1957691b1a202826e10e84c21ac8874df9e378..bc4a2db32cfba66bef2c444e1f822e0d2a57b91e 100644 --- a/paddle/framework/eigen_test.cc +++ b/paddle/framework/eigen_test.cc @@ -108,5 +108,24 @@ TEST(Eigen, Matrix) { } } +TEST(Eigen, MatrixReshape) { + Tensor t; + float* p = t.mutable_data({2, 3, 6, 4}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 6 * 4; ++i) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::Reshape(t, 2); + + ASSERT_EQ(2 * 3, em.dimension(0)); + ASSERT_EQ(6 * 4, em.dimension(1)); + + for (int i = 0; i < 2 * 3; i++) { + for (int j = 0; j < 6 * 4; j++) { + ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f); + } + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/images/duplicate_op2.graffle b/paddle/framework/images/duplicate_op2.graffle index 2b658085d6a55d368c320051ba7f94ec2900f13c..ede3bca30ae17d5af52505fd94dc2f79b23b57e0 100644 Binary files a/paddle/framework/images/duplicate_op2.graffle and b/paddle/framework/images/duplicate_op2.graffle differ diff --git a/paddle/framework/images/duplicate_op2.png b/paddle/framework/images/duplicate_op2.png index c5588015d1450fd8c1bda3580680d884494868bb..4e872dc2caf3b0cbd0d5176f11a14801b538dc86 100644 Binary files a/paddle/framework/images/duplicate_op2.png and b/paddle/framework/images/duplicate_op2.png differ diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 643f875491724bf443bd7727391734377ee6180c..ce938b21437195fed8c1adad4329fd139f3f96ab 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -43,6 +43,9 @@ class Tensor { template friend struct EigenTensor; + template + friend struct EigenMatrix; + template friend struct EigenVector; diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 94f436294f350e2a39785a09959efb3b17bd00a5..637f04ae0037bd402d855b8bcde8087bfe8328d1 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -148,5 +148,13 @@ inline Tensor& Tensor::Resize(const DDim& dims) { inline const DDim& Tensor::dims() const { return dims_; } +template +inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { + Tensor res; + res.ShareDataWith(src); + res.Resize(flatten_to_2d(src.dims(), num_col_dims)); + return res; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 7db38d5caeebccf710334e854faf785ef0f64063..55302ea47120f420e952b26830c8ea4cbcce6435 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -262,3 +262,16 @@ TEST(Tensor, CopyFrom) { } #endif } + +TEST(Tensor, ReshapeToMatrix) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + int* src_ptr = src.mutable_data({2, 3, 4, 9}, CPUPlace()); + for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { + src_ptr[i] = i; + } + Tensor res = ReshapeToMatrix(src, 2); + ASSERT_EQ(res.dims()[0], 2 * 3); + ASSERT_EQ(res.dims()[1], 4 * 9); +} \ No newline at end of file diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index f43f15e5cacb70b625d7791e1e02ce7780286200..4fd72d64a90ae6f16dd1499ceb7fba6e40fe4cea 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -44,6 +44,7 @@ if(WITH_GPU) add_simple_unittest(RowConvOpTest) add_simple_unittest(BlockExpandOpTest) add_simple_unittest(CropOpTest) + add_simple_unittest(SwitchOpTest) endif() add_simple_unittest(Im2ColTest) diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp index 674141ed39b7f5573948348e3ba3bb526ae43c66..b3e666e860d29d89650d48a23cf44917035a02d7 100644 --- a/paddle/function/EigenGemm.cpp +++ b/paddle/function/EigenGemm.cpp @@ -83,9 +83,9 @@ struct EigenBlasGemm { }; #ifdef PADDLE_TYPE_DOUBLE -template class EigenBlasGemm; +template struct EigenBlasGemm; #else -template class EigenBlasGemm; +template struct EigenBlasGemm; #endif } // namespace paddle diff --git a/paddle/function/GruFunctor.h b/paddle/function/GruFunctor.h new file mode 100644 index 0000000000000000000000000000000000000000..9f6392198ea360502f313cbe15dfae46ece69758 --- /dev/null +++ b/paddle/function/GruFunctor.h @@ -0,0 +1,159 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "GemmFunctor.h" +#include "hl_cpu_gru.cuh" + +namespace paddle { + +template +struct GruFunctor { + template + static void compute(OpResetOutput opResetOutput, + OpFinalOutput opFinalOutput, + hl_gru_value value, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate) { +#ifndef __NVCC__ + if (value.prevOutValue) { + BlasGemm::compute(false, + false, + batchSize, + 2 * frameSize, + frameSize, + 1, + value.prevOutValue, + frameSize, + value.gateWeight, + frameSize * 2, + 1, + value.gateValue, + frameSize * 3); + } + + forward_reset_output( + opResetOutput, value, frameSize, batchSize, active_gate); + + if (value.prevOutValue) { + BlasGemm::compute(false, + false, + batchSize, + frameSize, + frameSize, + 1, + value.resetOutputValue, + frameSize, + value.stateWeight, + frameSize, + 1, + value.gateValue + frameSize * 2, + frameSize * 3); + } + + forward_final_output( + opFinalOutput, value, frameSize, batchSize, active_node); +#endif + } +}; + +template +struct GruGradFunctor { + template + static void compute(OpStateGrad opStateGrad, + OpResetGrad opResetGrad, + hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate) { +#ifndef __NVCC__ + backward_state_grad( + opStateGrad, value, grad, frameSize, batchSize, active_node); + + if (value.prevOutValue && grad.prevOutGrad) { + BlasGemm::compute(false, + true, + batchSize, + frameSize, + frameSize, + 1, + grad.gateGrad + frameSize * 2, + frameSize * 3, + value.stateWeight, + frameSize, + 0, + grad.resetOutputGrad, + frameSize); + + if (grad.stateWeightGrad) { + BlasGemm::compute(true, + false, + frameSize, + frameSize, + batchSize, + 1, + value.resetOutputValue, + frameSize, + grad.gateGrad + frameSize * 2, + frameSize * 3, + 1, + grad.stateWeightGrad, + frameSize); + } + } + + backward_reset_grad( + opResetGrad, value, grad, frameSize, batchSize, active_gate); + + if (grad.prevOutGrad && value.prevOutValue) { + BlasGemm::compute(false, + true, + batchSize, + frameSize, + frameSize * 2, + 1, + grad.gateGrad, + frameSize * 3, + value.gateWeight, + frameSize * 2, + 1, + grad.prevOutGrad, + frameSize); + + if (grad.gateWeightGrad) { + BlasGemm::compute(true, + false, + frameSize, + frameSize * 2, + batchSize, + 1, + value.prevOutValue, + frameSize, + grad.gateGrad, + frameSize * 3, + 1, + grad.gateWeightGrad, + frameSize * 2); + } + } +#endif + } +}; + +} // namespace paddle diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 9b91e223a6a28586b11fe7ed4a44421e029a67bb..1e0cff436ff60d5a029e89657d00af2b0bf8b454 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -94,95 +94,4 @@ public: int paddingWidth); }; -template -struct Padding { - static void run(const T* src, - T* dest, - int channels, - int inputHeight, - int inputWidth, - int paddingHeight, - int paddingWidth) { - const int destWidth = inputWidth + 2 * paddingWidth; - for (int c = 0; c < channels; c++) { - if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(T)); - dest += destWidth * paddingHeight; - } - - for (int i = 0; i < inputHeight; i++) { - // padding head - for (int j = 0; j < paddingWidth; j++) { - *dest++ = T(0); - } - - memcpy(dest, src, inputWidth * sizeof(T)); - dest += inputWidth; - src += inputWidth; - - // padding tail - for (int j = 0; j < paddingWidth; j++) { - *dest++ = T(0); - } - } - - if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(T)); - dest += destWidth * paddingHeight; - } - } - } -}; - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -template <> -struct Padding { - static void run(const float* src, - float* dest, - int channels, - int inputHeight, - int inputWidth, - int paddingHeight, - int paddingWidth) { - const int destWidth = inputWidth + 2 * paddingWidth; - for (int c = 0; c < channels; c++) { - if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(float)); - dest += destWidth * paddingHeight; - } - - for (int i = 0; i < inputHeight; i++) { - // padding head - for (int j = 0; j < paddingWidth; j++) { - *dest++ = float(0); - } - - int step = inputWidth >> 2; - int remain = inputWidth & 3; - for (int s = 0; s < step; s++) { - float32x4_t s0 = vld1q_f32(src); - vst1q_f32(dest, s0); - src += 4; - dest += 4; - } - for (int r = 0; r < remain; r++) { - *dest++ = *src++; - } - - // padding tail - for (int j = 0; j < paddingWidth; j++) { - *dest++ = float(0); - } - } - - if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(float)); - dest += destWidth * paddingHeight; - } - } - } -}; - -#endif - } // namespace paddle diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp index 91b4b8ed91b6055babcfbab8f7adb2c55e2747d0..25e41edad54bec0f76a3de4799fab14241407272 100644 --- a/paddle/function/MulOp.cpp +++ b/paddle/function/MulOp.cpp @@ -13,18 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "MulOp.h" -/// todo(tianbing), delete it -#include -#include "paddle/math/MathFunctions.h" +#include "GemmFunctor.h" #include "paddle/math/SIMDFunctions.h" #include "paddle/utils/ThreadLocal.h" -#ifndef PADDLE_TYPE_DOUBLE -#define GEMM paddle::gemm -#else -#define GEMM paddle::gemm -#endif - namespace { inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { for (unsigned int i = 0; i < len; ++i) { @@ -114,19 +106,20 @@ void MulOp(CpuMatrix& out, real scaleT, bool aTrans, bool bTrans) { - GEMM(aTrans ? CblasTrans : CblasNoTrans, - bTrans ? CblasTrans : CblasNoTrans, - out.getHeight(), - out.getWidth(), - !aTrans ? a.getWidth() : a.getHeight(), - scaleAB, - a.getData(), - a.getStride(), - b.getData(), - b.getStride(), - scaleT, - out.getData(), - out.getStride()); + BlasGemm::compute( + aTrans, + bTrans, + out.getHeight(), + out.getWidth(), + !aTrans ? a.getWidth() : a.getHeight(), + scaleAB, + a.getData(), + a.getStride(), + b.getData(), + b.getStride(), + scaleT, + out.getData(), + out.getStride()); } /// dense matrix (+)= sparse matrix * dense matrix diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..01e252a8dc0cd5fa1e964efa01d04cf282b3dfe7 --- /dev/null +++ b/paddle/function/SwitchOp.cpp @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "SwitchOp.h" +#include "paddle/math/Vector.h" + +namespace paddle { + +template <> +void NCHW2NHWC(real* outputs, + const real* inputs, + const int num, + const int inC, + const int inH, + const int inW, + const int argType) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < inC; ++c) { + for (int h = 0; h < inH; ++h) { + for (int w = 0; w < inW; ++w) { + if (argType == ADD_TO) { + outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++); + } else { + outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++); + } + } + } + } + } +} + +template <> +void NHWC2NCHW(real* outputs, + const real* inputs, + const int num, + const int inH, + const int inW, + const int inC, + const int argType) { + for (int n = 0; n < num; ++n) { + for (int h = 0; h < inH; ++h) { + for (int w = 0; w < inW; ++w) { + for (int c = 0; c < inC; ++c) { + if (argType == ADD_TO) { + outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++); + } else { + outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++); + } + } + } + } + } +} + +/** + * \brief Switch dimension order of image input. + * The input and output is a 4D tensor. Switch order + * 'batch_size,channels, height, width' to + * order 'batch_size, height, width, channels'. + * + * Argument in this Function: + * \param inputs input data with order 'batch_size,channels, height, width'. + * \param outputs output data with order 'batch_size, height, width, channels'. + */ +template +class NCHW2NHWCFunc : public FunctionBase { +public: + void init(const FuncConfig& config) override {} + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + + size_t num = inputs[0].shape()[0]; + size_t inC = inputs[0].shape()[1]; + size_t inH = inputs[0].shape()[2]; + size_t inW = inputs[0].shape()[3]; + NCHW2NHWC(outputs[0].data(), + inputs[0].data(), + num, + inC, + inH, + inW, + outputs[0].getArgType()); + } +}; + +/** + * \brief Switch dimension order of image input. + * The input and output is a 4D tensor. Switch order + * 'batch_size, height, width, channels' to + * order 'batch_size, channels, height, width'. + * + * Argument in this Function: + * \param inputs input data with order 'batch_size, height, width, channels'. + * \param outputs output data with order 'batch_size, channels, height, width'. + */ +template +class NHWC2NCHWFunc : public FunctionBase { +public: + void init(const FuncConfig& config) override {} + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + + size_t num = inputs[0].shape()[0]; + size_t inH = inputs[0].shape()[1]; + size_t inW = inputs[0].shape()[2]; + size_t inC = inputs[0].shape()[3]; + + NHWC2NCHW(outputs[0].data(), + inputs[0].data(), + num, + inH, + inW, + inC, + outputs[0].getArgType()); + } +}; + +REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc); +REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc); +REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/SwitchOp.h b/paddle/function/SwitchOp.h new file mode 100644 index 0000000000000000000000000000000000000000..e4c1c3ac922f88c3e5424b5943082810aabfacdb --- /dev/null +++ b/paddle/function/SwitchOp.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Function.h" + +namespace paddle { + +/** + * \brief This funtion switch dimension order of image input. + * The input and output is a 4D tensor. Switch order 'batch_size, + *channels, height, width' to + * order 'batch_size, height, width, channels'. + * + * \param[out] outputs save results. + * \param[in] inputs input data. + * \param[in] num batch size of input data. + * \param[in] inC channel number of input data. + * \param[in] inH height of input data. + * \param[in] inH with of input data. + * \param[in] argType type of output argument. + */ +template +void NCHW2NHWC(real* outputs, + const real* inputs, + const int num, + const int inC, + const int inH, + const int inW, + const int argtype); + +/** + * \brief This funtion switch dimension order of image input. + * The input and output is a 4D tensor. Switch order 'batch_size, + *height, width, channels' to + * order 'batch_size, channels, height, width'. + * + * \param[out] inGrad gradients of previous layer. + * \param[in] outGrad output gradients. + * \param[in] num batch size of input data. + * \param[in] inH height of input data. + * \param[in] inW with of input data. + * \param[in] inC channel number of input data. + * \param[in] argType type of output argument. + */ +template +void NHWC2NCHW(real* inGrad, + const real* outGrad, + const int num, + const int inH, + const int inW, + const int inC, + const int argType); +} // namespace paddle diff --git a/paddle/function/SwitchOpGpu.cu b/paddle/function/SwitchOpGpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..45390a56c3f776ec18a65a6ba2f7149a7a6ef6c3 --- /dev/null +++ b/paddle/function/SwitchOpGpu.cu @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 Paddle + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "SwitchOp.h" +#include "hl_base.h" + +namespace paddle { + +__global__ void KeNCHW2NHWC(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int nthreads, + int argType) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < nthreads) { + const int w = idx % inW; + const int h = (idx / inW) % inH; + const int c = (idx / inW / inH) % inC; + const int n = idx / inW / inH / inC; + + const int off = ((n * inH + h) * inW + w) * inC + c; + if (argType == ADD_TO) { + outputs[off] += inputs[idx]; + } else { + outputs[off] = inputs[idx]; + } + } +} + +template <> +void NCHW2NHWC(real* outputs, + const real* inputs, + const int num, + const int inC, + const int inH, + const int inW, + const int argType) { + size_t nth = num * inC * inH * inW; + int blockSize = 1024; + int gridSize = (nth + 1024 - 1) / 1024; + KeNCHW2NHWC<<>>( + outputs, inputs, inC, inH, inW, nth, argType); + CHECK_SYNC("NCHW2NHWC"); +} + +__global__ void KeNHWC2NCHW(real* outputs, + const real* inputs, + int inH, + int inW, + int inC, + int nthreads, + int argType) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < nthreads) { + const int c = idx % inC; + const int w = (idx / inC) % inW; + const int h = (idx / inC / inW) % inH; + const int n = idx / inW / inH / inC; + + const int off = ((n * inC + c) * inH + h) * inW + w; + if (argType == ADD_TO) { + outputs[off] += inputs[idx]; + } else { + outputs[off] = inputs[idx]; + } + } +} + +template <> +void NHWC2NCHW(real* outputs, + const real* inputs, + const int num, + const int inH, + const int inW, + const int inC, + const int argType) { + int nth = num * inC * inH * inW; + int blockSize = 1024; + int gridSize = (nth + 1024 - 1) / 1024; + KeNHWC2NCHW<<>>( + outputs, inputs, inH, inW, inC, nth, argType); + CHECK_SYNC("NHWC2NCHW"); +} + +} // namespace paddle diff --git a/paddle/function/SwitchOpTest.cpp b/paddle/function/SwitchOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..03b0dd66ddcbab713969ed747601ecb1b2eb7955 --- /dev/null +++ b/paddle/function/SwitchOpTest.cpp @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "FunctionTest.h" + +namespace paddle { + +TEST(Pad, real) { + for (size_t numSamples : {1, 4, 8, 16}) { + for (size_t channels : {1, 4, 8, 16}) { + for (size_t imgSizeH : {1, 4, 8, 16}) { + for (size_t imgSizeW : {1, 4, 8, 16}) { + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; + for (bool test_grad : {true, false}) { + CpuGpuFuncCompare compare(test_grad ? "NHWC2NCHW" : "NCHW2NHWC", + FuncConfig()); + TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; + TensorShape outDims{numSamples, imgSizeH, imgSizeW, channels}; + compare.addInputs( + BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims)); + compare.addOutputs(BufferArg( + VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO)); + compare.run(); + } + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp index f09e98587d1681d29a79a9cb0303c2d4356c6935..18126152ea0b4ebfe4ec5c8084479787814ed173 100644 --- a/paddle/function/neon/NeonDepthwiseConv.cpp +++ b/paddle/function/neon/NeonDepthwiseConv.cpp @@ -12,468 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "neon_util.h" +#include "NeonDepthwiseConv.h" #include "paddle/function/ConvOp.h" -#include "paddle/function/Im2Col.h" namespace paddle { -namespace neon { - #if defined(__ARM_NEON__) || defined(__ARM_NEON) -template -struct DepthwiseConvKernel {}; - -inline float32_t conv3x3(float32x4_t r0, - float32x4_t r1, - float32x4_t r2, - float32x4_t k0, - float32x4_t k1, - float32x4_t k2) { - float32x4_t tmp; - tmp = vmulq_f32(r0, k0); - tmp = vmlaq_f32(tmp, r1, k1); - tmp = vmlaq_f32(tmp, r2, k2); - return vaddvq_f32(tmp); -} - -inline float32_t conv4x4(float32x4_t r0, - float32x4_t r1, - float32x4_t r2, - float32x4_t r3, - float32x4_t k0, - float32x4_t k1, - float32x4_t k2, - float32x4_t k3) { - float32x4_t tmp; - tmp = vmulq_f32(r0, k0); - tmp = vmlaq_f32(tmp, r1, k1); - tmp = vmlaq_f32(tmp, r2, k2); - tmp = vmlaq_f32(tmp, r3, k3); - return vaddvq_f32(tmp); -} - -/** - * Each step calculates four elements of the output. - * First step: - * R0[0, 1, 2, 3...] * K[0][0] - * R0[1, 2, 3, 4...] * K[0][1] - * R0[2, 3, 4, 5...] * K[0][2] - * R1[0, 1, 2, 3...] * K[1][0] - * R1[1, 2, 3, 4...] * K[1][1] - * R1[2, 3, 4, 5...] * K[1][2] - * R2[0, 1, 2, 3...] * K[2][0] - * R2[1, 2, 3, 4...] * K[2][1] - * + R2[2, 3, 4, 5...] * K[2][2] - * ------------------------------ - * Output[0, 1, 2, 3] - */ -template <> -struct DepthwiseConvKernel<3, 1> { - static void run(const float* inputData, - const float* filterData, - int inputHeight, - int inputWidth, - int outputChannels, - int outputHeight, - int outputWidth, - int filterMultiplier, - float* outputData) { - const int steps = outputWidth >> 2; - const int remain = outputWidth & 3; - for (int c = 0; c < outputChannels; c++, filterData += 9) { - // Load the filters - float32x4_t k[3]; - k[0] = vld1q_f32(filterData); - k[1] = vld1q_f32(filterData + 3); - k[2] = vld1q_f32(filterData + 6); - k[0] = vsetq_lane_f32(0.f, k[0], 3); - k[1] = vsetq_lane_f32(0.f, k[1], 3); - k[2] = vsetq_lane_f32(0.f, k[2], 3); - - const float* r0 = - inputData + (c / filterMultiplier) * (inputHeight * inputWidth); - const float* r1 = r0 + inputWidth; - const float* r2 = r0 + inputWidth * 2; - float32x4_t input[3][3]; - for (int h = 0; h < outputHeight; h++) { - for (int s = 0; s < steps; s++) { - // Load the inputs - float32x4_t tmp; - input[0][0] = vld1q_f32(r0); - tmp = vld1q_f32(r0 + 4); - input[0][1] = vextq_f32(input[0][0], tmp, 1); - input[0][2] = vextq_f32(input[0][0], tmp, 2); - input[1][0] = vld1q_f32(r1); - tmp = vld1q_f32(r1 + 4); - input[1][1] = vextq_f32(input[1][0], tmp, 1); - input[1][2] = vextq_f32(input[1][0], tmp, 2); - input[2][0] = vld1q_f32(r2); - tmp = vld1q_f32(r2 + 4); - input[2][1] = vextq_f32(input[2][0], tmp, 1); - input[2][2] = vextq_f32(input[2][0], tmp, 2); - - float32x4_t tmp1 = vdupq_n_f32(0.f); - float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); - tmp1 = vaddq_f32(tmp1, tmp2); - - vst1q_f32(outputData, tmp1); - r0 += 4; - r1 += 4; - r2 += 4; - outputData += 4; - } - - for (int r = 0; r < remain; r++) { - float32x4_t i0 = vld1q_f32(r0); - float32x4_t i1 = vld1q_f32(r1); - float32x4_t i2 = vld1q_f32(r2); - *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]); - r0++; - r1++; - r2++; - outputData++; - } - - r0 += 2; - r1 += 2; - r2 += 2; - } - } - } -}; - -/** - * Each step calculates four elements of the output. - * First step: - * R0[0, 2, 4, 6...] * K[0][0] - * R0[1, 3, 5, 7...] * K[0][1] - * R0[2, 4, 6, 8...] * K[0][2] - * R1[0, 2, 4, 6...] * K[1][0] - * R1[1, 3, 5, 7...] * K[1][1] - * R1[2, 4, 6, 8...] * K[1][2] - * R2[0, 2, 4, 6...] * K[2][0] - * R2[1, 3, 5, 7...] * K[2][1] - * R2[2, 4, 6, 8...] * K[2][2] - * ------------------------------ - * Output[0, 1, 2, 3] - */ -template <> -struct DepthwiseConvKernel<3, 2> { - static void run(const float* inputData, - const float* filterData, - int inputHeight, - int inputWidth, - int outputChannels, - int outputHeight, - int outputWidth, - int filterMultiplier, - float* outputData) { - const int steps = outputWidth >> 2; - const int remain = outputWidth & 3; - for (int c = 0; c < outputChannels; c++, filterData += 9) { - // Load the filters - float32x4_t k[3]; - k[0] = vld1q_f32(filterData); - k[1] = vld1q_f32(filterData + 3); - k[2] = vld1q_f32(filterData + 6); - k[0] = vsetq_lane_f32(0.f, k[0], 3); - k[1] = vsetq_lane_f32(0.f, k[1], 3); - k[2] = vsetq_lane_f32(0.f, k[2], 3); - - const float* start = - inputData + (c / filterMultiplier) * (inputHeight * inputWidth); - float32x4_t input[3][3]; - for (int h = 0; h < outputHeight; h++) { - const float* r0 = start + 2 * h * inputWidth; - const float* r1 = start + (2 * h + 1) * inputWidth; - const float* r2 = start + (2 * h + 2) * inputWidth; - for (int s = 0; s < steps; s++) { - // Load the inputs - float32x4_t data1; - float32x4x2_t data2; - - data2 = vld2q_f32(r0); - input[0][0] = data2.val[0]; - input[0][1] = data2.val[1]; - data1 = vld1q_f32(r0 + 8); - input[0][2] = vextq_f32(data2.val[0], data1, 1); - - data2 = vld2q_f32(r1); - input[1][0] = data2.val[0]; - input[1][1] = data2.val[1]; - data1 = vld1q_f32(r1 + 8); - input[1][2] = vextq_f32(data2.val[0], data1, 1); - - data2 = vld2q_f32(r2); - input[2][0] = data2.val[0]; - input[2][1] = data2.val[1]; - data1 = vld1q_f32(r2 + 8); - input[2][2] = vextq_f32(data2.val[0], data1, 1); - - float32x4_t tmp1 = vdupq_n_f32(0.f); - float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); - tmp1 = vaddq_f32(tmp1, tmp2); - - vst1q_f32(outputData, tmp1); - r0 += 8; - r1 += 8; - r2 += 8; - outputData += 4; - } - - for (int r = 0; r < remain; r++) { - float32x4_t i0 = vld1q_f32(r0); - float32x4_t i1 = vld1q_f32(r1); - float32x4_t i2 = vld1q_f32(r2); - *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]); - r0 += 2; - r1 += 2; - r2 += 2; - outputData++; - } - } - } - } -}; - -/** - * Each step calculates four elements of the output. - */ -template <> -struct DepthwiseConvKernel<4, 1> { - static void run(const float* inputData, - const float* filterData, - int inputHeight, - int inputWidth, - int outputChannels, - int outputHeight, - int outputWidth, - int filterMultiplier, - float* outputData) { - const int steps = outputWidth >> 2; - const int remain = outputWidth & 3; - for (int c = 0; c < outputChannels; c++, filterData += 16) { - // Load the filters - float32x4_t k[4]; - k[0] = vld1q_f32(filterData); - k[1] = vld1q_f32(filterData + 4); - k[2] = vld1q_f32(filterData + 8); - k[3] = vld1q_f32(filterData + 12); - - const float* r0 = - inputData + (c / filterMultiplier) * (inputHeight * inputWidth); - const float* r1 = r0 + inputWidth; - const float* r2 = r0 + inputWidth * 2; - const float* r3 = r0 + inputWidth * 3; - float32x4_t input[4][4]; - for (int h = 0; h < outputHeight; h++) { - for (int s = 0; s < steps; s++) { - // Load the inputs - float32x4_t tmp; - input[0][0] = vld1q_f32(r0); - tmp = vld1q_f32(r0 + 4); - input[0][1] = vextq_f32(input[0][0], tmp, 1); - input[0][2] = vextq_f32(input[0][0], tmp, 2); - input[0][3] = vextq_f32(input[0][0], tmp, 3); - - input[1][0] = vld1q_f32(r1); - tmp = vld1q_f32(r1 + 4); - input[1][1] = vextq_f32(input[1][0], tmp, 1); - input[1][2] = vextq_f32(input[1][0], tmp, 2); - input[1][3] = vextq_f32(input[1][0], tmp, 3); - - input[2][0] = vld1q_f32(r2); - tmp = vld1q_f32(r2 + 4); - input[2][1] = vextq_f32(input[2][0], tmp, 1); - input[2][2] = vextq_f32(input[2][0], tmp, 2); - input[2][3] = vextq_f32(input[2][0], tmp, 3); - - input[3][0] = vld1q_f32(r3); - tmp = vld1q_f32(r3 + 4); - input[3][1] = vextq_f32(input[3][0], tmp, 1); - input[3][2] = vextq_f32(input[3][0], tmp, 2); - input[3][3] = vextq_f32(input[3][0], tmp, 3); - - float32x4_t tmp1 = vdupq_n_f32(0.f); - float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); - tmp1 = vaddq_f32(tmp1, tmp2); - - vst1q_f32(outputData, tmp1); - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; - outputData += 4; - } - - for (int r = 0; r < remain; r++) { - float32x4_t i0 = vld1q_f32(r0); - float32x4_t i1 = vld1q_f32(r1); - float32x4_t i2 = vld1q_f32(r2); - float32x4_t i3 = vld1q_f32(r3); - *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]); - r0++; - r1++; - r2++; - r3++; - outputData++; - } - - r0 += 3; - r1 += 3; - r2 += 3; - r3 += 3; - } - } - } -}; - -/** - * Each step calculates four elements of the output. - */ -template <> -struct DepthwiseConvKernel<4, 2> { - static void run(const float* inputData, - const float* filterData, - int inputHeight, - int inputWidth, - int outputChannels, - int outputHeight, - int outputWidth, - int filterMultiplier, - float* outputData) { - const int steps = outputWidth >> 2; - const int remain = outputWidth & 3; - for (int c = 0; c < outputChannels; c++, filterData += 16) { - // Load the filters - float32x4_t k[4]; - k[0] = vld1q_f32(filterData); - k[1] = vld1q_f32(filterData + 4); - k[2] = vld1q_f32(filterData + 8); - k[3] = vld1q_f32(filterData + 12); - - const float* start = - inputData + (c / filterMultiplier) * (inputHeight * inputWidth); - float32x4_t input[4][4]; - for (int h = 0; h < outputHeight; h++) { - const float* r0 = start + 2 * h * inputWidth; - const float* r1 = start + (2 * h + 1) * inputWidth; - const float* r2 = start + (2 * h + 2) * inputWidth; - const float* r3 = start + (2 * h + 3) * inputWidth; - for (int s = 0; s < steps; s++) { - // Load the inputs - float32x4x2_t data1; - float32x4x2_t data2; - - data1 = vld2q_f32(r0); - data2 = vld2q_f32(r0 + 8); - input[0][0] = data1.val[0]; - input[0][1] = data1.val[1]; - input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1); - input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1); - - data1 = vld2q_f32(r1); - data2 = vld2q_f32(r1 + 8); - input[1][0] = data1.val[0]; - input[1][1] = data1.val[1]; - input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1); - input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1); - - data1 = vld2q_f32(r2); - data2 = vld2q_f32(r2 + 8); - input[2][0] = data1.val[0]; - input[2][1] = data1.val[1]; - input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1); - input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1); - - data1 = vld2q_f32(r3); - data2 = vld2q_f32(r3 + 8); - input[3][0] = data1.val[0]; - input[3][1] = data1.val[1]; - input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1); - input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1); - - float32x4_t tmp1 = vdupq_n_f32(0.f); - float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); - tmp1 = vaddq_f32(tmp1, tmp2); - - vst1q_f32(outputData, tmp1); - r0 += 8; - r1 += 8; - r2 += 8; - r3 += 8; - outputData += 4; - } - - for (int r = 0; r < remain; r++) { - float32x4_t i0 = vld1q_f32(r0); - float32x4_t i1 = vld1q_f32(r1); - float32x4_t i2 = vld1q_f32(r2); - float32x4_t i3 = vld1q_f32(r3); - *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]); - r0 += 2; - r1 += 2; - r2 += 2; - r3 += 2; - outputData++; - } - } - } - } -}; - template class NeonDepthwiseConvFunction : public ConvFunctionBase { public: @@ -497,16 +42,16 @@ public: const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - size_t filterMultiplier = outputChannels / groups_; + int batchSize = input[0]; + int inputChannels = input[1]; + int inputHeight = input[2]; + int inputWidth = input[3]; + int filterHeight = getFilterHeight(filter); + int filterWidth = getFilterWidth(filter); + int outputChannels = output[1]; + int outputHeight = output[2]; + int outputWidth = output[3]; + int filterMultiplier = outputChannels / groups_; CHECK_EQ(inputChannels, groups_); // only support strideH() == strideW() and filterHeight == filterWidth. @@ -519,22 +64,19 @@ public: // padding the input float* inputPadding = inputData; + int padInputHeight = inputHeight + 2 * paddingH(); + int padInputWidth = inputWidth + 2 * paddingW(); if (paddingH() > 0 || paddingW() > 0) { - int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) * - (inputWidth + 2 * paddingW()); + int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; resizeBuffer(newSize); inputPadding = reinterpret_cast(memory_->getBuf()); - Padding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - paddingH(), - paddingW()); - - // height and width of padding data - inputHeight += 2 * paddingH(); - inputWidth += 2 * paddingW(); + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); } std::function::run; + DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; } else if (filterWidth == 3 && strideW() == 2) { - DepthWiseConv = DepthwiseConvKernel<3, 2>::run; + DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run; } else if (filterWidth == 4 && strideW() == 1) { - DepthWiseConv = DepthwiseConvKernel<4, 1>::run; + DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; } else if (filterWidth == 4 && strideW() == 2) { - DepthWiseConv = DepthwiseConvKernel<4, 2>::run; + DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run; } else { LOG(FATAL) << "Not supported"; } - for (size_t i = 0; i < batchSize; i++) { + for (int i = 0; i < batchSize; i++) { DepthWiseConv(inputPadding, filterData, - inputHeight, - inputWidth, + padInputHeight, + padInputWidth, outputChannels, outputHeight, outputWidth, filterMultiplier, outputData); - inputPadding += inputChannels * inputHeight * inputWidth; + inputPadding += inputChannels * padInputHeight * padInputWidth; outputData += outputChannels * outputHeight * outputWidth; } } }; +#ifndef PADDLE_TYPE_DOUBLE REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction); +#endif #endif -} // namespace neon } // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/function/neon/NeonDepthwiseConv.h new file mode 100644 index 0000000000000000000000000000000000000000..aefeea78badbca3d0d09e292e4e1e148618f8ac6 --- /dev/null +++ b/paddle/function/neon/NeonDepthwiseConv.h @@ -0,0 +1,631 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "neon_util.h" + +namespace paddle { + +namespace neon { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template +struct DepthwiseConvKernel {}; + +inline float32_t conv3x3(float32x4_t r0, + float32x4_t r1, + float32x4_t r2, + float32x4_t k0, + float32x4_t k1, + float32x4_t k2) { + float32x4_t tmp; + tmp = vmulq_f32(r0, k0); + tmp = vmlaq_f32(tmp, r1, k1); + tmp = vmlaq_f32(tmp, r2, k2); + return vaddvq_f32(tmp); +} + +inline float32_t conv4x4(float32x4_t r0, + float32x4_t r1, + float32x4_t r2, + float32x4_t r3, + float32x4_t k0, + float32x4_t k1, + float32x4_t k2, + float32x4_t k3) { + float32x4_t tmp; + tmp = vmulq_f32(r0, k0); + tmp = vmlaq_f32(tmp, r1, k1); + tmp = vmlaq_f32(tmp, r2, k2); + tmp = vmlaq_f32(tmp, r3, k3); + return vaddvq_f32(tmp); +} + +/** + * Each step calculates four elements of the output. + * First step: + * R0[0, 1, 2, 3...] * K[0][0] + * R0[1, 2, 3, 4...] * K[0][1] + * R0[2, 3, 4, 5...] * K[0][2] + * R1[0, 1, 2, 3...] * K[1][0] + * R1[1, 2, 3, 4...] * K[1][1] + * R1[2, 3, 4, 5...] * K[1][2] + * R2[0, 1, 2, 3...] * K[2][0] + * R2[1, 2, 3, 4...] * K[2][1] + * + R2[2, 3, 4, 5...] * K[2][2] + * ------------------------------ + * Output[0, 1, 2, 3] + */ +template <> +struct DepthwiseConvKernel<3, 1> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 9) { + // Load the filters + float32x4_t k[3]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 3); + k[2] = vld1q_f32(filterData + 6); + k[0] = vsetq_lane_f32(0.f, k[0], 3); + k[1] = vsetq_lane_f32(0.f, k[1], 3); + k[2] = vsetq_lane_f32(0.f, k[2], 3); + + const float* r0 = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + const float* r1 = r0 + inputWidth; + const float* r2 = r0 + inputWidth * 2; + float32x4_t input[3][3]; + for (int h = 0; h < outputHeight; h++) { + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t tmp; + input[0][0] = vld1q_f32(r0); + tmp = vld1q_f32(r0 + 4); + input[0][1] = vextq_f32(input[0][0], tmp, 1); + input[0][2] = vextq_f32(input[0][0], tmp, 2); + input[1][0] = vld1q_f32(r1); + tmp = vld1q_f32(r1 + 4); + input[1][1] = vextq_f32(input[1][0], tmp, 1); + input[1][2] = vextq_f32(input[1][0], tmp, 2); + input[2][0] = vld1q_f32(r2); + tmp = vld1q_f32(r2 + 4); + input[2][1] = vextq_f32(input[2][0], tmp, 1); + input[2][2] = vextq_f32(input[2][0], tmp, 2); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 4; + r1 += 4; + r2 += 4; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]); + r0++; + r1++; + r2++; + outputData++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } + } +}; + +/** + * Each step calculates four elements of the output. + * First step: + * R0[0, 2, 4, 6...] * K[0][0] + * R0[1, 3, 5, 7...] * K[0][1] + * R0[2, 4, 6, 8...] * K[0][2] + * R1[0, 2, 4, 6...] * K[1][0] + * R1[1, 3, 5, 7...] * K[1][1] + * R1[2, 4, 6, 8...] * K[1][2] + * R2[0, 2, 4, 6...] * K[2][0] + * R2[1, 3, 5, 7...] * K[2][1] + * R2[2, 4, 6, 8...] * K[2][2] + * ------------------------------ + * Output[0, 1, 2, 3] + */ +template <> +struct DepthwiseConvKernel<3, 2> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 9) { + // Load the filters + float32x4_t k[3]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 3); + k[2] = vld1q_f32(filterData + 6); + k[0] = vsetq_lane_f32(0.f, k[0], 3); + k[1] = vsetq_lane_f32(0.f, k[1], 3); + k[2] = vsetq_lane_f32(0.f, k[2], 3); + + const float* start = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + float32x4_t input[3][3]; + for (int h = 0; h < outputHeight; h++) { + const float* r0 = start + 2 * h * inputWidth; + const float* r1 = start + (2 * h + 1) * inputWidth; + const float* r2 = start + (2 * h + 2) * inputWidth; + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t data1; + float32x4x2_t data2; + + data2 = vld2q_f32(r0); + input[0][0] = data2.val[0]; + input[0][1] = data2.val[1]; + data1 = vld1q_f32(r0 + 8); + input[0][2] = vextq_f32(data2.val[0], data1, 1); + + data2 = vld2q_f32(r1); + input[1][0] = data2.val[0]; + input[1][1] = data2.val[1]; + data1 = vld1q_f32(r1 + 8); + input[1][2] = vextq_f32(data2.val[0], data1, 1); + + data2 = vld2q_f32(r2); + input[2][0] = data2.val[0]; + input[2][1] = data2.val[1]; + data1 = vld1q_f32(r2 + 8); + input[2][2] = vextq_f32(data2.val[0], data1, 1); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 8; + r1 += 8; + r2 += 8; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]); + r0 += 2; + r1 += 2; + r2 += 2; + outputData++; + } + } + } + } +}; + +/** + * Each step calculates four elements of the output. + */ +template <> +struct DepthwiseConvKernel<4, 1> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 16) { + // Load the filters + float32x4_t k[4]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 4); + k[2] = vld1q_f32(filterData + 8); + k[3] = vld1q_f32(filterData + 12); + + const float* r0 = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + const float* r1 = r0 + inputWidth; + const float* r2 = r0 + inputWidth * 2; + const float* r3 = r0 + inputWidth * 3; + float32x4_t input[4][4]; + for (int h = 0; h < outputHeight; h++) { + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t tmp; + input[0][0] = vld1q_f32(r0); + tmp = vld1q_f32(r0 + 4); + input[0][1] = vextq_f32(input[0][0], tmp, 1); + input[0][2] = vextq_f32(input[0][0], tmp, 2); + input[0][3] = vextq_f32(input[0][0], tmp, 3); + + input[1][0] = vld1q_f32(r1); + tmp = vld1q_f32(r1 + 4); + input[1][1] = vextq_f32(input[1][0], tmp, 1); + input[1][2] = vextq_f32(input[1][0], tmp, 2); + input[1][3] = vextq_f32(input[1][0], tmp, 3); + + input[2][0] = vld1q_f32(r2); + tmp = vld1q_f32(r2 + 4); + input[2][1] = vextq_f32(input[2][0], tmp, 1); + input[2][2] = vextq_f32(input[2][0], tmp, 2); + input[2][3] = vextq_f32(input[2][0], tmp, 3); + + input[3][0] = vld1q_f32(r3); + tmp = vld1q_f32(r3 + 4); + input[3][1] = vextq_f32(input[3][0], tmp, 1); + input[3][2] = vextq_f32(input[3][0], tmp, 2); + input[3][3] = vextq_f32(input[3][0], tmp, 3); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + float32x4_t i3 = vld1q_f32(r3); + *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]); + r0++; + r1++; + r2++; + r3++; + outputData++; + } + + r0 += 3; + r1 += 3; + r2 += 3; + r3 += 3; + } + } + } +}; + +/** + * Each step calculates four elements of the output. + */ +template <> +struct DepthwiseConvKernel<4, 2> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 16) { + // Load the filters + float32x4_t k[4]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 4); + k[2] = vld1q_f32(filterData + 8); + k[3] = vld1q_f32(filterData + 12); + + const float* start = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + float32x4_t input[4][4]; + for (int h = 0; h < outputHeight; h++) { + const float* r0 = start + 2 * h * inputWidth; + const float* r1 = start + (2 * h + 1) * inputWidth; + const float* r2 = start + (2 * h + 2) * inputWidth; + const float* r3 = start + (2 * h + 3) * inputWidth; + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4x2_t data1; + float32x4x2_t data2; + + data1 = vld2q_f32(r0); + data2 = vld2q_f32(r0 + 8); + input[0][0] = data1.val[0]; + input[0][1] = data1.val[1]; + input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r1); + data2 = vld2q_f32(r1 + 8); + input[1][0] = data1.val[0]; + input[1][1] = data1.val[1]; + input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r2); + data2 = vld2q_f32(r2 + 8); + input[2][0] = data1.val[0]; + input[2][1] = data1.val[1]; + input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r3); + data2 = vld2q_f32(r3 + 8); + input[3][0] = data1.val[0]; + input[3][1] = data1.val[1]; + input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + float32x4_t i3 = vld1q_f32(r3); + *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]); + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + outputData++; + } + } + } + } +}; + +template +struct Padding { + static void run(const T* input, + T* inputPadding, + int channels, + int inputHeight, + int inputWidth, + int padInputHeight, + int padInputWidth) { + const int paddingHeight = (padInputHeight - inputHeight) / 2; + const int paddingWidth = (padInputWidth - inputWidth) / 2; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T)); + inputPadding += padInputWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *inputPadding++ = T(0); + } + + memcpy(inputPadding, input, inputWidth * sizeof(T)); + inputPadding += inputWidth; + input += inputWidth; + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *inputPadding++ = T(0); + } + } + + if (paddingHeight > 0) { + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T)); + inputPadding += padInputWidth * paddingHeight; + } + } + } +}; + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +template <> +struct Padding { + static void run(const float* input, + float* inputPadding, + int channels, + int inputHeight, + int inputWidth, + int padInputHeight, + int padInputWidth) { + const int paddingHeight = (padInputHeight - inputHeight) / 2; + const int paddingWidth = (padInputWidth - inputWidth) / 2; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float)); + inputPadding += padInputWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *inputPadding++ = float(0); + } + + int step = inputWidth >> 2; + int remain = inputWidth & 3; + for (int s = 0; s < step; s++) { + float32x4_t s0 = vld1q_f32(input); + vst1q_f32(inputPadding, s0); + input += 4; + inputPadding += 4; + } + for (int r = 0; r < remain; r++) { + *inputPadding++ = *input++; + } + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *inputPadding++ = float(0); + } + } + + if (paddingHeight > 0) { + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float)); + inputPadding += padInputWidth * paddingHeight; + } + } + } +}; + +// for stride is 2 +struct StridePadding { + static void run(const float* input, + float* inputPadding, + int channels, + int inputHeight, + int inputWidth, + int padInputHeight, + int padInputWidth) { + const int paddingHeight = (padInputHeight - (inputHeight * 2 - 1)) / 2; + const int paddingWidth = (padInputWidth - (inputWidth * 2 - 1)) / 2; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float)); + inputPadding += padInputWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *inputPadding++ = float(0); + } + + int step = inputWidth >> 2; + int remain = inputWidth & 3; + float32x4_t s1 = vdupq_n_f32(0.f); + for (int s = 0; s < step; s++) { + float32x4_t s0 = vld1q_f32(input); + float32x4x2_t v = {s0, s1}; + vst2q_f32(inputPadding, v); + input += 4; + inputPadding += 8; + } + for (int r = 0; r < remain; r++) { + *inputPadding++ = *input++; + *inputPadding++ = float(0); + } + inputPadding--; + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *inputPadding++ = float(0); + } + if (i != inputHeight - 1) { + memset(inputPadding, 0, padInputWidth * sizeof(float)); + inputPadding += padInputWidth; + } + } + + if (paddingHeight > 0) { + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float)); + inputPadding += padInputWidth * paddingHeight; + } + } + } +}; + +#endif + +#endif + +} // namespace neon +} // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp new file mode 100644 index 0000000000000000000000000000000000000000..49ca4bc8a0947ba329bd991e9f7d001623901a67 --- /dev/null +++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "NeonDepthwiseConv.h" +#include "paddle/function/ConvOp.h" + +namespace paddle { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template +class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + int batchSize = input[0]; + int inputChannels = input[1]; + int inputHeight = input[2]; + int inputWidth = input[3]; + int filterHeight = getFilterHeight(filter); + int filterWidth = getFilterWidth(filter); + int outputChannels = output[1]; + int outputHeight = output[2]; + int outputWidth = output[3]; + int filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + // only support strideH() == strideW() and filterHeight == filterWidth. + CHECK_EQ(strideH(), strideW()); + CHECK_EQ(paddingH(), paddingW()); + CHECK_EQ(filterHeight, filterWidth); + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + // padding the input, input -> inputPadding + float* inputPadding = inputData; + int padInputHeight = + (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH(); + int padInputWidth = + (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW(); + + if (padInputHeight > inputHeight || padInputWidth > inputWidth) { + int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; + resizeBuffer(newSize); + inputPadding = reinterpret_cast(memory_->getBuf()); + if (strideH() == 1) { + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } else if (strideH() == 2) { + neon::StridePadding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } else { + LOG(FATAL) << "Not supported"; + } + } + + std::function + DepthWiseConv; + + if (filterWidth == 3) { + DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; + } else if (filterWidth == 4) { + DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; + } else { + LOG(FATAL) << "Not supported"; + } + + for (int i = 0; i < batchSize; i++) { + DepthWiseConv(inputPadding, + filterData, + padInputHeight, + padInputWidth, + outputChannels, + outputHeight, + outputWidth, + filterMultiplier, + outputData); + inputPadding += inputChannels * padInputHeight * padInputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifndef PADDLE_TYPE_DOUBLE + +REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose, + CPU, + NeonDepthwiseConvTransposeFunction); + +#endif + +#endif + +} // namespace paddle diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h index 56b3febe2d27bb4fbf57e49079b3ad071d556914..e2db0450675084345ad55559d8988c5375801cc9 100644 --- a/paddle/function/neon/neon_util.h +++ b/paddle/function/neon/neon_util.h @@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) { return vget_lane_f32(vpadd_f32(v, v), 0); } -inline float32x4_t vmlaq_laneq_f32(float32x4_t a, - float32x4_t b, - float32x4_t v, - const int lane) { - return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane)); -} +#define vmlaq_laneq_f32(a, b, v, lane) \ + vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane)) #endif } // namespace neon diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp index 1ceaaaa206ee3cbc5421238574c7f310011ccaa5..f7a80e23e1bd49549bec57b360587adc6b423794 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.cpp +++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp @@ -62,14 +62,18 @@ void BatchNormBaseLayer::calFeatureMapSize() { const ImageConfig& conf = config_.inputs(0).image_conf(); imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); + imageD_ = inputLayers_[0]->getOutput().getFrameDepth(); + + if (0 == imageD_) imageD_ = conf.img_size_z(); if (imageH_ == 0 && imageW_ == 0) { imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); imageW_ = conf.img_size(); } else { getOutput().setFrameHeight(imageH_); getOutput().setFrameWidth(imageW_); + getOutput().setFrameDepth(imageD_); } - imgPixels_ = imageH_ * imageW_; + imgPixels_ = imageH_ * imageW_ * imageD_; } } // namespace paddle diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h index 230bafc31d96bbd49481a7ed135be6888688627e..e721d2d267a31cae46407673b8b1281e87055608 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.h +++ b/paddle/gserver/layers/BatchNormBaseLayer.h @@ -80,6 +80,7 @@ protected: /// Height or width of input image feature. /// Both of them are 1 if the input is fully-connected layer. + int imageD_; int imageH_; int imageW_; /// Height * Width. diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index 44ba2c4b7d1562d2ce839b5f4b4de1af35e6925f..49a9540c0b6e36b59ed786287ff5c4569b69a6a5 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -37,7 +37,7 @@ bool CudnnBatchNormLayer::init(const LayerMap& layerMap, } void CudnnBatchNormLayer::reshape(int batchSize) { - hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_, imageW_); + hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_); } void CudnnBatchNormLayer::forward(PassType passType) { @@ -104,7 +104,7 @@ void CudnnBatchNormLayer::forward(PassType passType) { EPS, batchSize, channels_, - imageH_, + imageH_ * imageD_, imageW_); } } diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp index 06907768e98f4bad952706cffbbd65d1f86cc6df..148516391c6cad8feff34b9bd1c10c27d1a8a0e6 100644 --- a/paddle/gserver/layers/GruCompute.cpp +++ b/paddle/gserver/layers/GruCompute.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "GruCompute.h" #include "hl_recurrent_apply.cuh" +#include "paddle/function/GruFunctor.h" #include "paddle/utils/Util.h" namespace paddle { @@ -25,13 +26,13 @@ void GruCompute::init(LayerConfig &config) { template <> void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { - hl_cpu_gru_forward(hppl::forward::gru_resetOutput(), - hppl::forward::gru_finalOutput(), - value, - frameSize, - batchSize, - activeNode_, - activeGate_); + GruFunctor::compute(hppl::forward::gru_resetOutput(), + hppl::forward::gru_finalOutput(), + value, + frameSize, + batchSize, + activeNode_, + activeGate_); } template <> @@ -39,14 +40,15 @@ void GruCompute::backward<0>(hl_gru_value value, hl_gru_grad grad, int frameSize, int batchSize) { - hl_cpu_gru_backward(hppl::backward::gru_stateGrad(), - hppl::backward::gru_resetGrad(), - value, - grad, - frameSize, - batchSize, - activeNode_, - activeGate_); + GruGradFunctor::compute( + hppl::backward::gru_stateGrad(), + hppl::backward::gru_resetGrad(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_); } } // namespace paddle diff --git a/paddle/gserver/layers/SwitchOrderLayer.cpp b/paddle/gserver/layers/SwitchOrderLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7eee6eaf078dab8d48adc4c7ee758a433672ac6 --- /dev/null +++ b/paddle/gserver/layers/SwitchOrderLayer.cpp @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "SwitchOrderLayer.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(switch_order, SwitchOrderLayer); + +bool SwitchOrderLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + auto& img_conf = config_.inputs(0).image_conf(); + size_t inD = img_conf.img_size_z(); + size_t inH = + img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(); + size_t inW = img_conf.img_size(); + size_t inC = img_conf.channels(); + inH = inH * inD; + inDims_ = TensorShape({0, inC, inH, inW}); + outDims_ = TensorShape(4); + + auto& reshape_conf = config_.reshape_conf(); + for (int i = 0; i < reshape_conf.height_axis_size(); i++) { + heightAxis_.push_back(reshape_conf.height_axis(i)); + } + for (int i = 0; i < reshape_conf.width_axis_size(); i++) { + widthAxis_.push_back(reshape_conf.width_axis(i)); + } + createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig()); + createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig()); + return true; +} + +void SwitchOrderLayer::setOutDims() { + outDims_.setDim(0, inDims_[0]); + outDims_.setDim(1, inDims_[2]); + outDims_.setDim(2, inDims_[3]); + outDims_.setDim(3, inDims_[1]); + reshapeHeight_ = 1; + for (size_t i = 0; i < heightAxis_.size(); i++) { + reshapeHeight_ *= outDims_[heightAxis_[i]]; + } + output_.setFrameHeight(reshapeHeight_); + reshapeWidth_ = 1; + for (size_t i = 0; i < widthAxis_.size(); i++) { + reshapeWidth_ *= outDims_[widthAxis_[i]]; + } + output_.setFrameWidth(reshapeWidth_); +} + +void SwitchOrderLayer::setInDims() { + MatrixPtr input = inputLayers_[0]->getOutputValue(); + size_t batchSize = input->getHeight(); + inDims_.setDim(0, batchSize); + int d = inputLayers_[0]->getOutput().getFrameDepth(); + d = (d == 0 ? 1 : d); + int h = inputLayers_[0]->getOutput().getFrameHeight(); + if (h != 0) inDims_.setDim(2, h * d); + int w = inputLayers_[0]->getOutput().getFrameWidth(); + if (w != 0) inDims_.setDim(3, w); + int totalCount = input->getElementCnt(); + int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]); + if (channels != 0) inDims_.setDim(1, channels); +} + +void SwitchOrderLayer::forward(PassType passType) { + Layer::forward(passType); + setInDims(); + setOutDims(); + resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]); + if (heightAxis_.size() > 0) { + getOutputValue()->reshape(reshapeHeight_, reshapeWidth_); + getOutputGrad()->reshape(reshapeHeight_, reshapeWidth_); + } + + // switch NCHW to NHWC + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inDims_); + outputs.addArg(*getOutputValue(), outDims_); + nchw2nhwc_[0]->calc(inputs, outputs); + forwardActivation(); +} + +void SwitchOrderLayer::backward(const UpdateCallback& callback) { + (void)callback; + backwardActivation(); + + // switch NHWC to NCHW + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outDims_); + outputs.addArg(*getInputGrad(0), inDims_, ADD_TO); + nhwc2nchw_[0]->calc(inputs, outputs); +} +} // namespace paddle diff --git a/paddle/gserver/layers/SwitchOrderLayer.h b/paddle/gserver/layers/SwitchOrderLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..47b1f7f73ee783b3eae3c9cfe08b1459cef16a71 --- /dev/null +++ b/paddle/gserver/layers/SwitchOrderLayer.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * \brief This layer calculate softmax in image channel dimension. + */ +class SwitchOrderLayer : public Layer { +public: + explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {} + + ~SwitchOrderLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + void setInDims(); + void setOutDims(); + +protected: + std::vector> nchw2nhwc_; + std::vector> nhwc2nchw_; + TensorShape inDims_; + TensorShape outDims_; + std::vector heightAxis_; + std::vector widthAxis_; + size_t reshapeHeight_; + size_t reshapeWidth_; +}; +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index a831ffbc73fbd6ad42fa31b2d6d583718474e59b..0e6be2df9ef5f0fae8ed2b0c65ac6c032fe45ab1 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1703,6 +1703,55 @@ TEST(Layer, BatchNormalizationLayer) { #endif } +void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + const int CHANNELS = 10; + const int IMG_SIZE = 16; + const int IMG_SIZE_Y = 8; + const int IMG_SIZE_Z = 8; + size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z; + config.layerConfig.set_type(type); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sigmoid"); + config.biasSize = CHANNELS; + config.inputDefs.push_back({INPUT_DATA, + "layer_0", + /* dim= */ size, + /* paraSize= */ CHANNELS}); + + config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + + LayerInputConfig* input = config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(CHANNELS); + img_conf->set_img_size(IMG_SIZE); + img_conf->set_img_size_y(IMG_SIZE_Y); + img_conf->set_img_size_z(IMG_SIZE_Z); + + testLayerGrad(config, + "batch_norm", + 64, + /* trans= */ trans, + useGpu, + /* useWeight */ true); +} + +TEST(Layer, testBatchNorm3DLayer) { + testBatchNorm3DLayer("batch_norm", false, false); +#ifndef PADDLE_ONLY_CPU + testBatchNorm3DLayer("batch_norm", false, true); + if (hl_get_cudnn_lib_version() >= int(4000)) { + testBatchNorm3DLayer("cudnn_batch_norm", false, true); + } +#endif +} + void testConvOperator(bool isDeconv) { TestConfig config; const int NUM_FILTERS = 16; @@ -2008,6 +2057,31 @@ TEST(Layer, CropLayer) { } } +TEST(Layer, SwitchOrderLayer) { + TestConfig config; + // config input_0 + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ImageConfig* img = input->mutable_image_conf(); + img->set_channels(4); + img->set_img_size(16); + img->set_img_size_y(16); + + ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf(); + reshape->add_height_axis(0); + reshape->add_height_axis(1); + reshape->add_height_axis(2); + reshape->add_width_axis(3); + + // config softmax layer + config.layerConfig.set_type("switch_order"); + config.layerConfig.set_name("switchOrderLayer"); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "switch_order", 100, false, useGpu, true); + } +} + vector randSampling(real range, int n) { CHECK_GE(range, n); vector num(range); diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index c8ba1074a1555bbddde7e5f0fb2a046138b27c09..c2f17beeb87942ea681f5d388659c0d280157b26 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) namespace paddle { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template <> void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, @@ -143,6 +144,7 @@ void gemm(const CBLAS_TRANSPOSE transA, C, ldc); } +#endif template <> int getrf(const CBLAS_ORDER order, @@ -182,6 +184,7 @@ int getri(const CBLAS_ORDER order, return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); } +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template <> void axpy(const int n, const float alpha, const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); @@ -201,6 +204,7 @@ template <> double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } +#endif #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML) diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 637643838ff433753e0cbb9154ee069c2f7c6d15..e8ea6e37ac527a19c529d1731b94bed970211755 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -40,7 +40,14 @@ extern "C" { #ifndef LAPACK_FOUND extern "C" { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS #include +#else +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; +#endif int LAPACKE_sgetrf( int matrix_layout, int m, int n, float* a, int lda, int* ipiv); int LAPACKE_dgetrf( @@ -56,6 +63,7 @@ int LAPACKE_dgetri( namespace paddle { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, @@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA, const T beta, T* C, const int ldc); +#endif template int getrf(const CBLAS_ORDER Order, @@ -84,10 +93,21 @@ int getri( const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv); template -void axpy(const int n, const T alpha, const T* x, T* y); +void axpy(const int n, const T alpha, const T* x, T* y) { + /// y = y + alpha * x + for (int i = 0; i < n; i++) { + y[i] = y[i] + alpha * x[i]; + } +} template -T dotProduct(const int n, const T* x, const T* y); +T dotProduct(const int n, const T* x, const T* y) { + T result = static_cast(0); + for (int i = 0; i < n; i++) { + result += x[i] * y[i]; + } + return result; +} template void vExp(const int n, const T* a, T* r); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 8bc42571f7c141aa31e18d0504b95b2ed4f0da77..4a2132c8d1bfa329ced575f9b78052bdbfe3e4d5 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -28,6 +28,7 @@ limitations under the License. */ #include "hl_top_k.h" #include "paddle/utils/Logging.h" +#include "paddle/function/GemmFunctor.h" #include "paddle/utils/ThreadLocal.h" #include "SIMDFunctions.h" @@ -2773,24 +2774,24 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; size_t a_col, b_col, a_row, b_row; - CBLAS_TRANSPOSE a_trans, b_trans; + bool a_trans, b_trans; if (!a->isTransposed()) { a_col = a->getWidth(); a_row = a->getHeight(); - a_trans = CblasNoTrans; + a_trans = false; } else { a_col = a->getHeight(); a_row = a->getWidth(); - a_trans = CblasTrans; + a_trans = true; } if (!b->isTransposed()) { b_col = b->getWidth(); b_row = b->getHeight(); - b_trans = CblasNoTrans; + b_trans = false; } else { b_col = b->getHeight(); b_row = b->getWidth(); - b_trans = CblasTrans; + b_trans = true; } CHECK_EQ(a_col, b_row); @@ -2807,7 +2808,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int lda = a->getStride(); int ldb = b->getStride(); int ldc = getStride(); - gemm( + BlasGemm::compute( a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); } diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 431d4e071072317c8fdfdc4f0d13e7cd4e3d062b..44180bca8bca53e74d71ce7bed3516399c01c81d 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -1616,6 +1616,10 @@ public: }; class CpuMatrix : public Matrix { +private: + MatrixPtr sftmaxSum_; + MatrixPtr sftmaxDot_; + public: CpuMatrix(size_t height, size_t width, bool trans = false); CpuMatrix(real* data, size_t height, size_t width, bool trans = false) diff --git a/paddle/operators/identity_op.cc b/paddle/operators/identity_op.cc index be956bf3b320d6beacdb0d2ca742c3e854194b19..7d9d4fa519d1c690feacbadc5175aeab49082282 100644 --- a/paddle/operators/identity_op.cc +++ b/paddle/operators/identity_op.cc @@ -18,17 +18,20 @@ namespace paddle { namespace operators { -// identity is a alias of scale op. This is also a example for creating a alias -// operator. +// The identity operator is an alias of the scale operator. This is also an +// example for creating an alias for an existing operator. template class IdentityOpMaker : public framework::OpProtoAndCheckerMaker { public: IdentityOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "input tensor of identity op"); - AddOutput("Out", "output tensor of identity op"); - AddComment("identity operator. Just a alias of scale op which scale = 1.0"); + AddInput("X", "The input tensor of identity operator."); + AddOutput("Out", "The output tensor of identity operator."); + AddComment(R"DOC( +The identity operator is an alias of the scale operator +with the attribute scale fixed to 1.0. +)DOC"); } }; diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index ed51d416ed9497eee45ba826ad672b8fb1ad3678..f8333f34f7b4c7b0f9a0f14a7a33f9d98e1d331c 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,8 +1,10 @@ if(WITH_GPU) - nv_library(math_function SRCS math_function.cc math_function.cu DEPS cblas device_context) + nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc + im2col.cu DEPS cblas device_context) else() - cc_library(math_function SRCS math_function.cc DEPS cblas device_context) + cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context) endif() nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) +cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc new file mode 100644 index 0000000000000000000000000000000000000000..5727c1cab16c1379ffe77f5594c057e93a042785 --- /dev/null +++ b/paddle/operators/math/im2col.cc @@ -0,0 +1,260 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/im2col.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const framework::Tensor& im, framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + + int input_channels = im.dims()[0]; + int input_height = im.dims()[1]; + int input_width = im.dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int output_height = col.dims()[3]; + int output_width = col.dims()[4]; + int channels_col = input_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / filter_width / filter_height; + for (int h = 0; h < output_height; ++h) { + for (int w = 0; w < output_width; ++w) { + int im_row_idx = h * stride_height + h_offset; + int im_col_idx = w * stride_width + w_offset; + if ((im_row_idx - padding_height) < 0 || + (im_row_idx - padding_height) >= input_height || + (im_col_idx - padding_width) < 0 || + (im_col_idx - padding_width) >= input_width) { + col_data[(c * output_height + h) * output_width + w] = T(0); + } else { + im_row_idx += c_im * input_height - padding_height; + im_col_idx -= padding_width; + col_data[(c * output_height + h) * output_width + w] = + im_data[im_row_idx * input_width + im_col_idx]; + } + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(framework::Tensor& im, const framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int input_channels = im.dims()[0]; + int input_height = im.dims()[1]; + int input_width = im.dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int output_height = col.dims()[3]; + int output_width = col.dims()[4]; + int channels_col = input_channels * filter_height * filter_width; + + T* im_data = im.data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / filter_width / filter_height; + for (int h = 0; h < output_height; ++h) { + for (int w = 0; w < output_width; ++w) { + int im_row_idx = h * stride_height + h_offset; + int im_col_idx = w * stride_width + w_offset; + if ((im_row_idx - padding_height) >= 0 && + (im_row_idx - padding_height) < input_height && + (im_col_idx - padding_width) >= 0 && + (im_col_idx - padding_width) < input_width) { + im_row_idx += c_im * input_height - padding_height; + im_col_idx -= padding_width; + im_data[im_row_idx * input_width + im_col_idx] += + col_data[(c * output_height + h) * output_width + w]; + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const framework::Tensor& im, framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int input_channels = im.dims()[0]; + int input_height = im.dims()[1]; + int input_width = im.dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int output_height = col.dims()[0]; + int output_width = col.dims()[1]; + + const T* im_data = im.data(); + T* col_data = col.data(); + + for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) { + for (int channel = 0; channel < input_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < filter_height; + ++filter_row_idx) { + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_row_offset = + col_row_idx * stride_height + filter_row_idx - padding_height; + int im_col_offset = + col_col_idx * stride_width + filter_col_idx - padding_width; + int col_offset = (((col_row_idx * output_width + col_col_idx) * + input_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + if (im_row_offset < 0 || im_row_offset >= input_height || + im_col_offset < 0 || im_col_offset >= input_width) { + col_data[col_offset] = T(0); + } else { + int im_offset = + (channel * input_height + im_row_offset) * input_width + + im_col_offset; + col_data[col_offset] = im_data[im_offset]; + } + } + } + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(framework::Tensor& im, const framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int input_channels = im.dims()[0]; + int input_height = im.dims()[1]; + int input_width = im.dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int output_height = col.dims()[0]; + int output_width = col.dims()[1]; + + T* im_data = im.data(); + const T* col_data = col.data(); + + for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) { + for (int channel = 0; channel < input_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < filter_height; + ++filter_row_idx) { + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_row_offset = + col_row_idx * stride_height + filter_row_idx - padding_height; + int im_col_offset = + col_col_idx * stride_width + filter_col_idx - padding_width; + int col_offset = (((col_row_idx * output_width + col_col_idx) * + input_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + if (im_row_offset >= 0 && im_row_offset < input_height && + im_col_offset >= 0 && im_col_offset < input_width) { + int im_offset = + (channel * input_height + im_row_offset) * input_width + + im_col_offset; + im_data[im_offset] += col_data[col_offset]; + } + } + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu new file mode 100644 index 0000000000000000000000000000000000000000..9bff7bee3c95093852305d392af0949b831e5665 --- /dev/null +++ b/paddle/operators/math/im2col.cu @@ -0,0 +1,374 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/im2col.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void im2col(const T* data_im, int num_outs, int height, int width, + int filter_height, int filter_width, int stride_height, + int stride_width, int padding_height, int padding_width, + int output_height, int output_width, T* data_col) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < num_outs) { + int w_out = index % output_width; + index /= output_width; + int h_out = index % output_height; + int channel_in = index / output_height; + int channel_out = channel_in * filter_height * filter_width; + int h_in = h_out * stride_height; + int w_in = w_out * stride_width; + + data_col += (channel_out * output_height + h_out) * output_width + w_out; + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + int rIdx = int(h_in + i); + int cIdx = int(w_in + j); + if ((rIdx - (int)padding_height) >= (int)height || + (rIdx - (int)padding_height) < 0 || + (cIdx - (int)padding_width) >= (int)width || + (cIdx - (int)padding_width) < 0) { + *data_col = 0; + } else { + rIdx = rIdx + channel_in * height - padding_height; + cIdx = cIdx - padding_width; + *data_col = data_im[rIdx * width + cIdx]; + } + data_col += output_height * output_width; + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const framework::Tensor& im, framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + + int input_channels = im.dims()[0]; + int input_height = im.dims()[1]; + int input_width = im.dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int output_height = col.dims()[3]; + int output_width = col.dims()[4]; + + int num_outputs = input_channels * output_height * output_width; + int blocks = (num_outputs + 1024 - 1) / 1024; + int block_x = 512; + int block_y = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(block_x, block_y); + im2col<<< + grid, threads, 0, + reinterpret_cast(context)->stream()>>>( + im.data(), num_outputs, input_height, input_width, filter_height, + filter_width, stride_height, stride_width, padding_height, + padding_width, output_height, output_width, col.data()); + } +}; + +template +__global__ void col2im(size_t n, const T* data_col, size_t height, size_t width, + size_t channels, size_t filter_height, + size_t filter_width, size_t stride_height, + size_t stride_width, size_t padding_height, + size_t padding_width, size_t output_height, + size_t output_width, T* data_im) { + size_t index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < n) { + T val = 0; + int w = int(index % width); + int h = int((index / width) % height); + int c = int(index / (width * height)); + if ((w - (int)padding_width) >= 0 && + (w - (int)padding_width) < (width - 2 * padding_width) && + (h - (int)padding_height) >= 0 && + (h - padding_height) < (height - 2 * padding_height)) { + // compute the start and end of the output + int w_col_start = (w < (int)filter_width) + ? 0 + : (w - int(filter_width)) / (int)stride_width + 1; + int w_col_end = + min((int)(w / (int)stride_width + 1), (int)(output_width)); + int h_col_start = (h < (int)filter_height) + ? 0 + : (h - (int)filter_height) / (int)stride_height + 1; + int h_col_end = min(int(h / stride_height + 1), int(output_height)); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + // the col location: [c * width * height + h_out, w_out] + int c_col = int(c * filter_height * filter_width) + + (h - h_col * (int)stride_height) * (int)filter_width + + (w - w_col * (int)stride_width); + val += + data_col[(c_col * output_height + h_col) * output_width + w_col]; + } + } + h -= padding_height; + w -= padding_width; + data_im[c * ((width - 2 * padding_width) * + (height - 2 * padding_height)) + + h * (width - 2 * padding_width) + w] += val; + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(framework::Tensor& im, const framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + + int input_channels = im.dims()[0]; + int input_height = im.dims()[1]; + int input_width = im.dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int output_height = col.dims()[3]; + int output_width = col.dims()[4]; + + size_t num_kernels = input_channels * (input_height + 2 * padding_height) * + (input_width + 2 * padding_width); + + size_t blocks = (num_kernels + 1024 - 1) / 1024; + size_t block_x = 512; + size_t block_y = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(block_x, block_y); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + col2im<<< + grid, threads, 0, + reinterpret_cast(context)->stream()>>>( + num_kernels, col.data(), input_height + 2 * padding_height, + input_width + 2 * padding_width, input_channels, filter_height, + filter_width, stride_height, stride_width, padding_height, + padding_width, output_height, output_width, im.data()); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +template +__global__ void im2colOCF(const T* im_data, T* col_data, int input_channels, + int input_height, int input_width, int filter_height, + int filter_width, int stride_height, int stride_width, + int padding_height, int padding_width, + int output_height, int output_width) { + int swid = blockIdx.x; + int shid = blockIdx.y; + for (int channelid = threadIdx.z; channelid < input_channels; + channelid += blockDim.z) { + for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { + int width_offset = idx + swid * stride_width - padding_width; + int height_offset = idy + shid * stride_height - padding_height; + int im_offset = width_offset + height_offset * input_width + + channelid * input_height * input_width; + + int col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * output_width + swid) * + (input_channels * filter_height * filter_width); + + if (height_offset >= input_height || height_offset < 0 || + width_offset >= input_width || width_offset < 0) { + col_data[col_offset] = T(0); + } else { + col_data[col_offset] = im_data[im_offset]; + } + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const framework::Tensor& im, framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int input_channels = im.dims()[0]; + int input_height = im.dims()[1]; + int input_width = im.dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int output_height = col.dims()[0]; + int output_width = col.dims()[1]; + + int block_dim_x = 0; + int block_dim_y = 0; + if (filter_height <= 4 && filter_width <= 4) { + block_dim_x = 4; + block_dim_y = 4; + } else if (filter_height <= 8 && filter_width <= 8) { + block_dim_x = 8; + block_dim_y = 8; + } else if (filter_height <= 16 && filter_width <= 16) { + block_dim_x = 16; + block_dim_y = 16; + } else { + block_dim_x = 32; + block_dim_y = 32; + } + + int block_dim_z = 1024 / block_dim_x / block_dim_y; + dim3 threads(block_dim_x, block_dim_y, + std::min(block_dim_z, input_channels)); + dim3 grid(output_width, output_height); + im2colOCF<<< + grid, threads, 0, + reinterpret_cast(context)->stream()>>>( + im.data(), col.data(), input_channels, input_height, input_width, + filter_height, filter_width, stride_height, stride_width, + padding_height, padding_width, output_height, output_width); + } +}; + +template +__global__ void col2imOCF(T* im_data, const T* col_data, int input_channels, + int input_height, int input_width, int filter_height, + int filter_width, int stride_height, int stride_width, + int padding_height, int padding_width, + int output_height, int output_width) { + int swid = blockIdx.x; + int shid = blockIdx.y; + for (int channelid = threadIdx.z; channelid < input_channels; + channelid += blockDim.z) { + for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { + int width_offset = idx + swid * stride_width - padding_width; + int height_offset = idy + shid * stride_height - padding_height; + int im_offset = width_offset + height_offset * input_width + + channelid * input_height * input_width; + + int col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * output_width + swid) * + (input_channels * filter_height * filter_width); + + if (height_offset >= 0 && height_offset < input_height && + width_offset >= 0 && width_offset < input_width) { + paddle::platform::CudaAtomicAdd(im_data + im_offset, + col_data[col_offset]); + } + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(framework::Tensor& im, const framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int input_channels = im.dims()[0]; + int input_height = im.dims()[1]; + int input_width = im.dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int output_height = col.dims()[0]; + int output_width = col.dims()[1]; + + int block_dim_x = 0; + int block_dim_y = 0; + if (filter_height <= 4 && filter_width <= 4) { + block_dim_x = 4; + block_dim_y = 4; + } else if (filter_height <= 8 && filter_width <= 8) { + block_dim_x = 8; + block_dim_y = 8; + } else if (filter_height <= 16 && filter_width <= 16) { + block_dim_x = 16; + block_dim_y = 16; + } else { + block_dim_x = 32; + block_dim_y = 32; + } + + int block_dim_z = 1024 / block_dim_x / block_dim_y; + dim3 threads(block_dim_x, block_dim_y, + std::min(block_dim_z, input_channels)); + dim3 grid(output_width, output_height); + col2imOCF<<< + grid, threads, 0, + reinterpret_cast(context)->stream()>>>( + im.data(), col.data(), input_channels, input_height, input_width, + filter_height, filter_width, stride_height, stride_width, + padding_height, padding_width, output_height, output_width); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h new file mode 100644 index 0000000000000000000000000000000000000000..8958c5457cc2c3034c34ca82fb2e98cc06be63c5 --- /dev/null +++ b/paddle/operators/math/im2col.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum class ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data. + * \param imShape The shape of imData, + * [input_channels, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * If the template argument Format is kCFO, the shape of colData is: + * [input_channels, filter_height, filter_width, output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_height * filter_width, and the width is equal + * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_height, + * filter_width, ======> [height, width] + * output_height, + * output_width] + * + * If the template argument Format is kOCF, the shape of colData is: + * [output_height, output_width, input_channels, filter_height, filter_width] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seq_length, step_size], where the seq_length + * is equal output_height * output_width, and the step_size is equal + * input_channels * filter_height * filter_width. + * + * Reshape: + * shape of colData shape of sequence matrix + * [output_height, + * output_width, + * input_channels, ======> [seqLength, stepSize] + * filter_height, + * filter_width] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Im2ColFunctor { + public: + void operator()(const framework::Tensor& im, framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context); +}; + +template +class Col2ImFunctor { + public: + void operator()(framework::Tensor& im, const framework::Tensor& col, + int stride_height, int stride_width, int padding_height, + int padding_width, platform::DeviceContext* context); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..186a33edcec88bd5e51091a524a778eeb27ad526 --- /dev/null +++ b/paddle/operators/math/im2col_test.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/im2col.h" +#include +#include + +template +void testIm2col() { + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor input; + paddle::framework::Tensor output_cfo; + paddle::framework::Tensor output_ocf; + paddle::framework::Tensor output_tmp; + + /** + * input = [0, 1, 2, + * 3, 4, 5] + * + * output_cfo = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5] + * + * output_ocf = [0, 1, 3, 4 + * 1, 2, 4, 5] + */ + int input_height = 2; + int input_width = 3; + int filter_size = 2; + int stride = 1; + int padding = 0; + int output_height = (input_height - filter_size + 2 * padding) / stride + 1; + int output_width = (input_width - filter_size + 2 * padding) / stride + 1; + float* input_ptr = input_tmp.mutable_data( + {1, input_height, input_width}, paddle::platform::CPUPlace()); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input_ptr, arr, 6 * sizeof(float)); + + auto* place = new Place(); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + input.CopyFrom(input_tmp, *place); + } + output_cfo.mutable_data( + {1, filter_size, filter_size, output_height, output_width}, *place); + output_ocf.mutable_data( + {output_height, output_width, 1, filter_size, filter_size}, *place); + + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, Place, float> + im2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kOCF, Place, float> + im2col_ocf; + + paddle::platform::DeviceContext* context; + if (paddle::platform::is_cpu_place(*place)) { + context = + new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); + } else { +#ifndef PADDLE_ONLY_CPU + context = + new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); +#else + PADDLE_THROW("no GPU support"); +#endif // PADDLE_ONLY_CPU + } + im2col(input, output_cfo, stride, stride, padding, padding, context); + im2col_ocf(input, output_ocf, stride, stride, padding, padding, context); + + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output_cfo.data(); + } else { + output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace()); + out_cfo_ptr = output_tmp.data(); + } + EXPECT_EQ(out_cfo_ptr[0], 0); + EXPECT_EQ(out_cfo_ptr[1], 1); + EXPECT_EQ(out_cfo_ptr[2], 1); + EXPECT_EQ(out_cfo_ptr[3], 2); + EXPECT_EQ(out_cfo_ptr[4], 3); + EXPECT_EQ(out_cfo_ptr[5], 4); + EXPECT_EQ(out_cfo_ptr[6], 4); + EXPECT_EQ(out_cfo_ptr[7], 5); + + float* out_ocf_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_ocf_ptr = output_ocf.data(); + } else { + output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace()); + out_ocf_ptr = output_tmp.data(); + } + EXPECT_EQ(out_ocf_ptr[0], 0); + EXPECT_EQ(out_ocf_ptr[1], 1); + EXPECT_EQ(out_ocf_ptr[2], 3); + EXPECT_EQ(out_ocf_ptr[3], 4); + EXPECT_EQ(out_ocf_ptr[4], 1); + EXPECT_EQ(out_ocf_ptr[5], 2); + EXPECT_EQ(out_ocf_ptr[6], 4); + EXPECT_EQ(out_ocf_ptr[7], 5); +} + +TEST(math, im2col) { + testIm2col(); +#ifndef PADDLE_ONLY_CPU + testIm2col(); +#endif +} \ No newline at end of file diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 28a47cdff2e9b7a965ff9f99e787bb8315010823..710a56a0e8e2d17162d7d000df226f1537104eb9 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -25,18 +25,27 @@ class MulOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - auto dim0 = ctx.Input("X")->dims(); - auto dim1 = ctx.Input("Y")->dims(); - PADDLE_ENFORCE_EQ(dim0.size(), 2, - "input X(%s) should be a tensor with 2 dims, a matrix", - ctx.op().Input("X")); - PADDLE_ENFORCE_EQ(dim1.size(), 2, - "input Y(%s) should be a tensor with 2 dims, a matrix", - ctx.op().Input("Y")); + auto x_dims = ctx.Input("X")->dims(); + auto y_dims = ctx.Input("Y")->dims(); + int x_num_col_dims = Attr("x_num_col_dims"); + int y_num_col_dims = Attr("y_num_col_dims"); + + PADDLE_ENFORCE(x_dims.size() > x_num_col_dims, + "The rank of input tensor X(%s) should be larger than " + "`mul_op`'s `x_num_col_dims`.", + ctx.op().Input("X")); + PADDLE_ENFORCE(y_dims.size() > y_num_col_dims, + "The rank of input tensor Y(%s) should be larger than " + "`mul_op`'s `y_num_col_dims`.", + ctx.op().Input("Y")); + + auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); + auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); + PADDLE_ENFORCE_EQ( - dim0[1], dim1[0], + x_mat_dims[1], y_mat_dims[0], "First matrix's width must be equal with second matrix's height."); - ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + ctx.Output("Out")->Resize({x_mat_dims[0], y_mat_dims[1]}); } }; @@ -47,6 +56,23 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The first input of mul op"); AddInput("Y", "The second input of mul op"); AddOutput("Out", "The output of mul op"); + AddAttr( + "x_num_col_dims", + R"DOC(mul_op can take tensors with more than two dimensions as input `X`, + in that case, tensors will be reshaped to a matrix. The matrix's first + dimension(column length) will be the product of tensor's last + `num_col_dims` dimensions, and the matrix's second dimension(row length) + will be the product of tensor's first `rank - num_col_dims` dimensions. + )DOC") + .SetDefault(1) + .EqualGreaterThan(1); + AddAttr( + "y_num_col_dims", + R"DOC(mul_op can take tensors with more than two dimensions as input `Y`, + in that case, tensors will be reshaped to a matrix. Just like input `X`. + )DOC") + .SetDefault(1) + .EqualGreaterThan(1); AddComment(R"DOC( Two Element Mul Operator. @@ -70,10 +96,20 @@ class MulOpGrad : public framework::OperatorWithKernel { auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); auto *x_grad = ctx.Output(framework::GradVarName("X")); auto *y_grad = ctx.Output(framework::GradVarName("Y")); - PADDLE_ENFORCE(x_dims[0] == out_dims[0], - "Out@GRAD M X N must equal to X dims 0, M "); - PADDLE_ENFORCE(y_dims[1] == out_dims[1], - "Out@GRAD M X N must equal to Y dims 1, N "); + + auto x_mat_dims = + framework::flatten_to_2d(x_dims, Attr("x_num_col_dims")); + auto y_mat_dims = + framework::flatten_to_2d(y_dims, Attr("y_num_col_dims")); + + PADDLE_ENFORCE_EQ( + x_mat_dims[0], out_dims[0], + "The first dimension of Out@GRAD must equal to the first dimension of " + "the first operand."); + PADDLE_ENFORCE_EQ( + y_mat_dims[1], out_dims[1], + "The second dimension of Out@GRAD must equal to the second " + "dimension of the second operand."); if (x_grad) x_grad->Resize(x_dims); if (y_grad) y_grad->Resize(y_dims); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 05a79e13b3470e39a5ebd0394ba05629553a5075..3c01f868bda8cba488b3403df456d63d6b082fa6 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -1,7 +1,7 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. + You may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 @@ -31,13 +31,25 @@ template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); + const Tensor* x = context.Input("X"); + const Tensor* y = context.Input("Y"); + Tensor* z = context.Output("Out"); + const Tensor x_matrix = + x->dims().size() > 2 + ? framework::ReshapeToMatrix( + *x, context.template Attr("x_num_col_dims")) + : *x; + const Tensor y_matrix = + y->dims().size() > 2 + ? framework::ReshapeToMatrix( + *y, context.template Attr("y_num_col_dims")) + : *y; + z->mutable_data(context.GetPlace()); auto* device_context = const_cast(context.device_context_); - math::matmul(*x, false, *y, false, 1, z, 0, device_context); + math::matmul(x_matrix, false, y_matrix, false, 1, z, 0, + device_context); } }; @@ -45,23 +57,39 @@ template class MulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); + int x_num_col_dims = ctx.template Attr("x_num_col_dims"); + int y_num_col_dims = ctx.template Attr("y_num_col_dims"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor x_matrix = + x->dims().size() > 2 ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = + y->dims().size() > 2 ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; + const Tensor* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + Tensor* dx = ctx.Output(framework::GradVarName("X")); + Tensor* dy = ctx.Output(framework::GradVarName("Y")); auto* device_context = const_cast(ctx.device_context_); if (dx) { dx->mutable_data(ctx.GetPlace()); + Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( + *dx, x_num_col_dims) + : *dx; // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - math::matmul(*dout, false, *y, true, 1, dx, 0, device_context); + math::matmul(*dout, false, y_matrix, true, 1, &dx_matrix, 0, + device_context); } if (dy) { dy->mutable_data(ctx.GetPlace()); + Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( + *dy, y_num_col_dims) + : *dy; // dy = x' * dout. dy K x N, dout : M x N, x : M x K - math::matmul(*x, true, *dout, false, 1, dy, 0, device_context); + math::matmul(x_matrix, true, *dout, false, 1, &dy_matrix, 0, + device_context); } } }; diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 30b4b404315a9f041e21d79b75fd06307e33f7f9..fa8f0ff1a858143af427b51025279c726f1628e0 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -25,14 +25,19 @@ class RowwiseAddOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - auto dim0 = ctx.Input("X")->dims(); - auto dim1 = ctx.Input("b")->dims(); - - PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix"); - PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector"); - PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same"); - PADDLE_ENFORCE(ctx.OutputSize("Out") == 1, "The output size must be 1"); - ctx.Output("Out")->Resize(ctx.Input("X")->dims()); + auto x_dims = ctx.Input("X")->dims(); + auto b_dims = ctx.Input("b")->dims(); + PADDLE_ENFORCE_GT( + x_dims.size(), b_dims.size(), + "The rank of input `X` must be larger than the one of input `b`."); + + int num_col_dims = x_dims.size() - b_dims.size(); + + PADDLE_ENFORCE_EQ( + framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims, + "The width of two operands must be same"); + PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1"); + ctx.Output("Out")->Resize(x_dims); } }; @@ -61,13 +66,20 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"), "b should not be null"); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); - auto dims0 = ctx.Input("X")->dims(); - auto dims1 = ctx.Input("b")->dims(); - PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1") + auto x_dims = ctx.Input("X")->dims(); + auto b_dims = ctx.Input("b")->dims(); + PADDLE_ENFORCE_GT( + x_dims.size(), b_dims.size(), + "The rank of input `X` must be larger than the one of input `b`."); + + int num_col_dims = x_dims.size() - b_dims.size(); + PADDLE_ENFORCE_EQ( + framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims, + "The width of two operands must be same"); auto *dx = ctx.Output(framework::GradVarName("X")); auto *db = ctx.Output(framework::GradVarName("b")); - if (dx) dx->Resize(dims0); - if (db) db->Resize(dims1); + if (dx) dx->Resize(x_dims); + if (db) db->Resize(b_dims); } }; diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index 4e926d9f2947f37b71e81c0fa592b0c66b19c640..35774b940926f77167b8f19597027e74d3477e5b 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -33,10 +33,12 @@ class RowwiseAddKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto out = context.Output("Out"); out->mutable_data(context.GetPlace()); - - auto input = EigenMatrix::From(*context.Input("X")); - auto bias = EigenVector::From(*context.Input("b")); - auto output = EigenMatrix::From(*out); + int num_col_dims = context.Input("X")->dims().size() - + context.Input("b")->dims().size(); + auto input = + EigenMatrix::Reshape(*context.Input("X"), num_col_dims); + auto bias = EigenVector::Flatten(*context.Input("b")); + auto output = EigenMatrix::Reshape(*out, num_col_dims); const int bias_size = bias.dimension(0); const int rest_size = input.size() / bias_size; @@ -54,12 +56,15 @@ class RowwiseAddGradKernel : public framework::OpKernel { auto* dout = context.Input(framework::GradVarName("Out")); auto* dx = context.Output(framework::GradVarName("X")); auto* db = context.Output(framework::GradVarName("b")); + int num_col_dims = context.Input("X")->dims().size() - + context.Input("b")->dims().size(); - auto out_grad = EigenMatrix::From(*dout); + auto out_grad = EigenMatrix::Reshape(*dout, num_col_dims); auto place = context.GetEigenDevice(); + if (dx) { dx->mutable_data(context.GetPlace()); - EigenMatrix::From(*dx).device(place) = out_grad; + EigenMatrix::Reshape(*dx, num_col_dims).device(place) = out_grad; } if (db) { diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index 3d82b345829b0a554a204ada91c807e42b71dc58..ea991f683d841b3dc4624a0d8aa3c88367fd3c6d 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -44,11 +44,13 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { The equation is: Out = scale*X )DOC"); - AddAttr("scale", "scale of scale operator.").SetDefault(1.0); + AddAttr("scale", "The scaling factor of the scale operator.") + .SetDefault(1.0); } }; -// Scale Op's gradient is scale op, too. +// The operator to calculate gradients of a scale operator is just the scale +// operator itself. // Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out)) template class ScaleGradOp : public NetOp { diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 7d062ad67c048bc6bef68121f86334eb3f1efe92..7166b2f60be8a6088ab3a81686f7bed1b7181d97 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -51,7 +51,7 @@ the other dimensions in the K-dimensional vector input. Then the ratio of the exponential of the given dimension and the sum of exponential values of all the other dimensions is the output of the softmax operator. -For each row `i` and each column `j` in X, we have: +For each row `i` and each column `j` in input X, we have: Y[i, j] = exp(X[i, j]) / sum_j(exp(X[i, j])) )DOC"); @@ -64,14 +64,15 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), - "Input(Y@GRAD) should not be null"); - PADDLE_ENFORCE(ctx.Input("Y")->dims() == - ctx.Input(framework::GradVarName("Y"))->dims(), - "the shape of Input(0) and Input(1) should be the same"); + "Input(Y@GRAD) should be not null."); + PADDLE_ENFORCE_EQ(ctx.Input("Y")->dims(), + ctx.Input(framework::GradVarName("Y"))->dims(), + "Input(Y) and its gradients should have a same shape."); + ctx.Output(framework::GradVarName("X")) - ->Resize(ctx.Input("Y")->dims()); + ->Resize(ctx.Input("X")->dims()); } }; diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 4fa6b59540498638c3b7df639ae10a66c0fa1c16..8a3a5ab927c0e2937936fcc973f000d4d95c3dbc 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -28,12 +28,12 @@ template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto input = context.Input("X"); - auto output = context.Output("Y"); - output->mutable_data(context.GetPlace()); + auto X = context.Input("X"); + auto Y = context.Output("Y"); + Y->mutable_data(context.GetPlace()); - auto logits = EigenMatrix::From(*input); - auto softmax = EigenMatrix::From(*output); + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); const int kBatchDim = 0; const int kClassDim = 1; diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..38d2f0a09aec751734864947a2f3cfa20107e22f --- /dev/null +++ b/paddle/operators/top_k_op.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/top_k_op.h" + +namespace paddle { +namespace operators { + +class TopkOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), + "Input of TopkOP must be initialized."); + auto *input = ctx.Input("X"); + const int k = static_cast(ctx.Attr("k")); + + PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); + PADDLE_ENFORCE_GE(input->dims().size(), 1, "input must have >= 1d shape"); + PADDLE_ENFORCE_GE(input->dims()[input->dims().size() - 1], k, + "input must have >= k columns"); + + framework::DDim dims = input->dims(); + dims[dims.size() - 1] = k; + ctx.Output("Out")->Resize(dims); + ctx.Output("Indices")->Resize(dims); + } +}; + +class TopkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TopkOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of Topk op"); + AddOutput("Out", "The output tensor of Topk op"); + AddOutput("Indices", "The indices of Topk elements of input"); + AddComment( + R"DOC(If the input is a vector (1d tensor), finds the k largest entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j]. + + For matrices, computes the top k entries in each row. )DOC"); + AddAttr("k", + "Number of top elements to look for along the last " + "dimension (along each row for matrices).") + .SetDefault(1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(top_k, ops::TopkOp, ops::TopkOpMaker); +REGISTER_OP_CPU_KERNEL(top_k, + ops::TopkKernel); diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..afe4d149c53819c45e20353bc9d16393f3f61e0f --- /dev/null +++ b/paddle/operators/top_k_op.cu @@ -0,0 +1,318 @@ +/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/op_registry.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Pair { + __device__ __forceinline__ Pair() {} + __device__ __forceinline__ Pair(T value, int id) : v(value), id(id) {} + + __device__ __forceinline__ void set(T value, int id) { + v = value; + id = id; + } + + __device__ __forceinline__ void operator=(const Pair& in) { + v = in.v; + id = in.id; + } + + __device__ __forceinline__ bool operator<(const T value) const { + return (v < value); + } + + __device__ __forceinline__ bool operator<(const Pair& in) const { + return (v < in.v) || ((v == in.v) && (id > in.id)); + } + + __device__ __forceinline__ bool operator>(const Pair& in) const { + return (v > in.v) || ((v == in.v) && (id < in.id)); + } + + T v; + int id; +}; + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, + int beam_size) { + for (int k = beam_size - 2; k >= 0; k--) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p) { + for (int k = beam_size - 2; k >= 0; k--) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + AddTo(topk, tmp, beam_size); + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, const Pair& max, + int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + if (tmp < max) { + AddTo(topk, tmp, beam_size); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, + int idx, int dim, int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + AddTo(topk, tmp, beam_size); + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, + int idx, int dim, const Pair& max, + int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + if (tmp < max) { + AddTo(topk, tmp, beam_size); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, + int beam_size, const T* src, + bool& firstStep, bool& is_empty, + Pair& max, int dim, + const int tid) { + if (beam > 0) { + int length = beam < beam_size ? beam : beam_size; + if (firstStep) { + firstStep = false; + GetTopK(topk, src, tid, dim, length); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - beam) { + topk[k] = topk[k + beam]; + } else { + topk[k].set(-INFINITY, -1); + } + } + if (!is_empty) { + GetTopK(topk + MaxLength - beam, src, tid, dim, max, + length); + } + } + + max = topk[MaxLength - 1]; + if (max.v == -1) is_empty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, + int beam_size, const T* val, + int* col, bool& firstStep, + bool& is_empty, Pair& max, + int dim, const int tid) { + if (beam > 0) { + int length = beam < beam_size ? beam : beam_size; + if (firstStep) { + firstStep = false; + GetTopK(topk, val, col, tid, dim, length); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - beam) { + topk[k] = topk[k + beam]; + } else { + topk[k].set(-INFINITY, -1); + } + } + if (!is_empty) { + GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + length); + } + } + + max = topk[MaxLength - 1]; + if (max.v == -1) is_empty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, + Pair topk[], T** topVal, + int** topIds, int& beam, int& k, + const int tid, const int warp) { + while (true) { + __syncthreads(); + if (tid < BlockSize / 2) { + if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { + maxid[tid] = tid + BlockSize / 2; + } else { + maxid[tid] = tid; + } + } + __syncthreads(); + for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { + if (tid < stride) { + if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { + maxid[tid] = maxid[tid + stride]; + } + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + **topVal = sh_topk[maxid[0]].v; + **topIds = sh_topk[maxid[0]].id; + (*topVal)++; + (*topIds)++; + } + if (tid == maxid[0]) beam++; + if (--k == 0) break; + __syncthreads(); + + if (tid == maxid[0]) { + if (beam < MaxLength) { + sh_topk[tid] = topk[beam]; + } + } + if (maxid[0] / 32 == warp) { + if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + } + } +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top MaxLength value; + * 2. merge to sh_topk, block reduce and get max value; + * 3. go to the second setp, until one thread's topk value is null; + * 4. go to the first setp, until get the topk value. + */ +template +__global__ void KeMatrixTopK(T* output, int output_stride, int* indices, + const T* src, int lds, int dim, int k) { + __shared__ Pair sh_topk[BlockSize]; + __shared__ int maxid[BlockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + output += blockIdx.x * output_stride; + indices += blockIdx.x * k; + + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int k = 0; k < MaxLength; k++) { + topk[k].set(-INFINITY, -1); + } + while (k) { + ThreadGetTopK(topk, beam, k, + src + blockIdx.x * lds, firststep, + is_empty, max, dim, tid); + + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &output, + &indices, beam, k, tid, warp); + } +} + +template +class TopkOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + size_t k = static_cast(ctx.Attr("k")); + + const T* input_data = input->data(); + + T* output_data = output->mutable_data(ctx.GetPlace()); + // FIXME(typhoonzero): data is always converted to type T? + int* indices_data = indices->mutable_data(ctx.GetPlace()); + + size_t input_height = input->dims()[0]; + size_t input_width = input->dims()[1]; + if (k > input_width) k = input_width; + + // NOTE: pass lds and dim same to input width. + // NOTE: old matrix implementation of stride is different to eigen. + // TODO(typhoonzero): launch kernel on specified stream. + // TODO(typhoonzero): refine this kernel. + dim3 threads(256, 1); + dim3 grid(input_height, 1); + + KeMatrixTopK<<>>( + output_data, output->dims()[1], indices_data, input_data, input_width, + input_width, int(k)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_GPU_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ef66acc1d569282a42be64b7a5e90f3fbdb20690 --- /dev/null +++ b/paddle/operators/top_k_op.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +class TopkKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Get the top k elements of each row of input tensor + // FIXME: only deal with matrix(2d tensor). + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + // k is determined by Attr + const size_t k = static_cast(ctx.Attr("k")); + + T* output_data = output->mutable_data(ctx.GetPlace()); + T* indices_data = indices->mutable_data(ctx.GetPlace()); + + auto eg_input = EigenMatrix::From(*input); + + // reshape input to a flattern matrix(like flat_inner_dims) + framework::DDim inputdims = input->dims(); + const size_t row = framework::product( + framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); + const size_t col = inputdims[inputdims.size() - 1]; + Eigen::DSizes flat2dims(row, col); + // NOTE: eigen shape doesn't affect paddle tensor. + eg_input.reshape(flat2dims); + + for (size_t i = 0; i < row; i++) { + std::vector> vec; + for (size_t j = 0; j < col; j++) { + vec.push_back(std::pair(eg_input(i, j), j)); + } + + std::partial_sort( + vec.begin(), vec.begin() + k, vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < k; j++) { + output_data[i * k + j] = vec[j].first; + indices_data[i * k + j] = vec[j].second; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index bf1a321c3fa686b217d754b4920ff6f9fd8e6669..4b5a3ae3e95cff2ac8a94697f57ab1c2ed983ac3 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -49,6 +49,7 @@ USE_OP(minus); USE_OP(cos_sim); USE_CPU_ONLY_OP(gather); USE_CPU_ONLY_OP(scatter); +USE_OP(top_k); USE_OP(squared_l2_distance); USE_OP(reshape); diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 17986420220fec173bbf3ecff240d4c504f8adbd..e57f793ac42b19037e9ca43a5e4a3ac5447dc34c 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -37,7 +37,7 @@ Configuring cmake in /paddle/build ... -DWITH_PYTHON=${WITH_PYTHON:-ON} -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ - -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} + -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 5584e29e2a155a8062f7d4f2016bd389bd9803f3..aabd2da5e499c8e648f2967e56c661ec37f025a1 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,22 +2,58 @@ set -xe -mkdir -p /paddle/build_android -cd /paddle/build_android -rm -rf /paddle/install 2>/dev/null || true -cmake -DCMAKE_SYSTEM_NAME=Android \ - -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ - -DANDROID_ABI=armeabi-v7a \ - -DANDROID_ARM_NEON=ON \ - -DANDROID_ARM_MODE=ON \ - -DHOST_C_COMPILER=/usr/bin/gcc \ - -DHOST_CXX_COMPILER=/usr/bin/g++ \ - -DCMAKE_INSTALL_PREFIX=/paddle/install \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_C_FLAGS_RELWITHDEBINFO="-O3" \ - -DCMAKE_CXX_FLAGS_RELWITHDEBINFO="-O3" \ - -DWITH_C_API=ON \ - -DWITH_SWIG_PY=OFF \ - .. +BUILD_ROOT=/paddle/build_android +DEST_ROOT=/paddle/install + +rm -rf $BUILD_ROOT 2>/dev/null || true +mkdir -p $BUILD_ROOT +cd $BUILD_ROOT + +if [ $ANDROID_ABI == "armeabi-v7a" ]; then + cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=$ANDROID_ABI \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DHOST_C_COMPILER=/usr/bin/gcc \ + -DHOST_CXX_COMPILER=/usr/bin/g++ \ + -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ + .. +elif [ $ANDROID_ABI == "arm64-v8a" ]; then + cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=$ANDROID_ABI \ + -DANDROID_ARM_MODE=ON \ + -DHOST_C_COMPILER=/usr/bin/gcc \ + -DHOST_CXX_COMPILER=/usr/bin/g++ \ + -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_EIGEN_FOR_BLAS=OFF \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ + .. +elif [ $ANDROID_ABI == "armeabi" ]; then + cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=$ANDROID_ABI \ + -DANDROID_ARM_MODE=ON \ + -DHOST_C_COMPILER=/usr/bin/gcc \ + -DHOST_CXX_COMPILER=/usr/bin/g++ \ + -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ + .. +else + echo "Invalid ANDROID_ABI: $ANDROID_ABI" +fi + make -j `nproc` make install -j `nproc` diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh index 004067a8f55351509caaf2bbf6d5c349a4698a79..9da71d1e8cdec4047167fe354973e6bac85fb9f0 100755 --- a/paddle/scripts/travis/build_android.sh +++ b/paddle/scripts/travis/build_android.sh @@ -22,6 +22,7 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_ABI=armeabi-v7a \ -DANDROID_ARM_NEON=ON \ -DANDROID_ARM_MODE=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ -DWITH_STYLE_CHECK=OFF \ diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index b18b73e06a6c39c3bf9717280bc6323917c80efb..2755fdd9cd1c2509cad996557c6fb24363d42d8a 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -320,6 +320,9 @@ void loadFileList(const std::string& fileListFileName, } double getMemoryUsage() { +#if defined(__ANDROID__) + return 0.0; +#else FILE* fp = fopen("/proc/meminfo", "r"); CHECK(fp) << "failed to fopen /proc/meminfo"; size_t bufsize = 256 * sizeof(char); @@ -357,6 +360,7 @@ double getMemoryUsage() { delete[] buf; double usedMem = 1.0 - 1.0 * (freeMem + bufMem + cacheMem) / totalMem; return usedMem; +#endif } SyncThreadPool* getGlobalSyncThreadPool() { diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h index 613844669d2495ada7b8f7a841f47b821b7fdeba..22ce2534d3468ded36221810aa61c15b37f13f3d 100644 --- a/paddle/utils/Util.h +++ b/paddle/utils/Util.h @@ -33,6 +33,13 @@ limitations under the License. */ #include "Flags.h" #include "hl_gpu.h" +#if defined(__ANDROID__) && (__ANDROID_API__ < 21) +inline int rand_r(unsigned int* seedp) { + (void)seedp; + return rand(); +} +#endif + /** * Loop over the elements in a container * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach, diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 4ddf023780c704cb10c51ee9e5d7cb63420f9d73..ebf0911d6ea0b39d51447859ae2aef485b50b0e6 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -271,6 +271,7 @@ message ImageConfig { // The size of input feature map. required uint32 img_size = 8; optional uint32 img_size_y = 9; + optional uint32 img_size_z = 10 [ default = 1 ]; } message PriorBoxConfig { @@ -287,6 +288,11 @@ message PadConfig { repeated uint32 pad_w = 4; } +message ReshapeConfig { + repeated uint32 height_axis = 1; + repeated uint32 width_axis = 2; +} + message MultiBoxLossConfig { required uint32 num_classes = 1; required float overlap_threshold = 2; @@ -339,7 +345,6 @@ message LayerInputConfig { } message LayerConfig { - required string name = 1; required string type = 2; optional uint64 size = 3; @@ -515,7 +520,11 @@ message LayerConfig { // for HuberRegressionLoss optional double delta = 57 [ default = 1.0 ]; + // for 3D data optional uint64 depth = 58 [ default = 1 ]; + + // for switch order layer + optional ReshapeConfig reshape_conf = 59; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 152a56190c1ffddbf9590ed8f71308ceb88403f4..7e9112b43bf851575a3a798886d8b1b17e7c2017 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1332,6 +1332,12 @@ def parse_image(image, input_layer_name, image_conf): get_img_size(input_layer_name, image_conf.channels) +def parse_image3d(image, input_layer_name, image_conf): + image_conf.channels = image.channels + image_conf.img_size, image_conf.img_size_y, image_conf.img_size_z = \ + get_img3d_size(input_layer_name, image_conf.channels) + + def parse_norm(norm, input_layer_name, norm_conf): norm_conf.norm_type = norm.norm_type config_assert( @@ -2365,9 +2371,11 @@ class BatchNormLayer(LayerBase): name, inputs, bias=True, + img3D=False, use_global_stats=True, moving_average_fraction=0.9, batch_norm_type=None, + mean_var_names=None, **xargs): if inputs is None: inputs = [] @@ -2409,24 +2417,69 @@ class BatchNormLayer(LayerBase): input_layer = self.get_input_layer(0) image_conf = self.config.inputs[0].image_conf - parse_image(self.inputs[0].image, input_layer.name, image_conf) - - # Only pass the width and height of input to batch_norm layer - # when either of it is non-zero. - if input_layer.width != 0 or input_layer.height != 0: - self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size, - image_conf.channels, False) + if img3D: + parse_image3d(self.inputs[0].image, input_layer.name, image_conf) + # Only pass the width and height of input to batch_norm layer + # when either of it is non-zero. + if input_layer.width != 0 or input_layer.height != 0: + self.set_cnn_layer( + input_layer_name=name, + depth=image_conf.img_size_z, + height=image_conf.img_size_y, + width=image_conf.img_size, + channels=image_conf.channels, + is_print=True) + else: + self.set_layer_size(input_layer.size) else: - self.set_layer_size(input_layer.size) + parse_image(self.inputs[0].image, input_layer.name, image_conf) + # Only pass the width and height of input to batch_norm layer + # when either of it is non-zero. + if input_layer.width != 0 or input_layer.height != 0: + self.set_cnn_layer( + input_layer_name=name, + height=image_conf.img_size_y, + width=image_conf.img_size, + channels=image_conf.channels, + is_print=True) + else: + self.set_layer_size(input_layer.size) psize = self.calc_parameter_size(image_conf) dims = [1, psize] + if mean_var_names is not None: + assert len(mean_var_names) == 2 + self.inputs[1].parameter_name = mean_var_names[0] + self.inputs[2].parameter_name = mean_var_names[1] + self.create_input_parameter(0, psize) self.create_input_parameter(1, psize, dims) self.create_input_parameter(2, psize, dims) self.create_bias_parameter(bias, psize) + def set_cnn_layer(self, + input_layer_name, + depth=None, + height=None, + width=None, + channels=None, + is_print=True): + depthIsNone = False + if depth is None: + depth = 1 + depthIsNone = True + size = depth * height * width * channels + self.set_layer_size(size) + self.set_layer_height_width(height, width) + self.set_layer_depth(depth) + if is_print and depthIsNone: + print("output for %s: c = %d, h = %d, w = %d, size = %d" % + (input_layer_name, channels, height, width, size)) + elif is_print: + print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" % + (input_layer_name, channels, depth, height, width, size)) + def calc_parameter_size(self, image_conf): return image_conf.channels @@ -2688,9 +2741,20 @@ class AddToLayer(LayerBase): super(AddToLayer, self).__init__( name, 'addto', 0, inputs=inputs, **xargs) config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer') - for input_index in xrange(len(self.inputs)): - input_layer = self.get_input_layer(input_index) - self.set_layer_size(input_layer.size) + + if len(self.inputs) > 1: + for input_index in xrange(len(self.inputs)): + assert self.get_input_layer(0).height == self.get_input_layer( + input_index).height + assert self.get_input_layer(0).width == self.get_input_layer( + input_index).width + assert self.get_input_layer(0).depth == self.get_input_layer( + input_index).depth + + self.set_layer_size(self.get_input_layer(0).size) + self.set_layer_height_width(self.get_input_layer(0).height, \ + self.get_input_layer(0).width) + self.set_layer_depth(self.get_input_layer(0).depth) self.create_bias_parameter(bias, self.config.size) @@ -3370,11 +3434,20 @@ class ConcatenateLayer(LayerBase): name, 'concat', 0, inputs=inputs, **xargs) size = 0 for input_index in xrange(len(self.inputs)): + assert self.get_input_layer(0).height == self.get_input_layer( + input_index).height + assert self.get_input_layer(0).width == self.get_input_layer( + input_index).width + assert self.get_input_layer(0).depth == self.get_input_layer( + input_index).depth input_layer = self.get_input_layer(input_index) input = self.inputs[input_index] if self.config.size == 0: size += input_layer.size + self.set_layer_height_width(self.get_input_layer(0).height, \ + self.get_input_layer(0).width) + self.set_layer_depth(self.get_input_layer(0).depth) self.set_layer_size(size) @@ -3670,6 +3743,15 @@ class RecurrentLayerGroup(LayerBase): name, 'recurrent_layer_group', 0, inputs=[], device=device) +@config_layer('switch_order') +class SwitchOrderLayer(LayerBase): + def __init__(self, name, inputs, reshape, **xargs): + super(SwitchOrderLayer, self).__init__( + name, 'switch_order', 0, inputs=inputs, **xargs) + self.config.reshape_conf.heightAxis.extend(reshape['height']) + self.config.reshape_conf.widthAxis.extend(reshape['width']) + + # Deprecated, use a new layer specific class instead @config_func def Layer(name, type, **xargs): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 47ac601e678013aceb62005d6f25595f49673d2c..dc68c213da66ac680e6b14266cb5038a5ba73ec2 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -131,6 +131,7 @@ __all__ = [ 'row_conv_layer', 'dropout_layer', 'prelu_layer', + 'switch_order_layer', 'gated_unit_layer', 'crop_layer', 'sub_nested_seq_layer', @@ -239,6 +240,7 @@ class LayerType(object): SMOOTH_L1 = 'smooth_l1' PRELU = 'prelu' + SWITCH_ORDER_LAYER = 'switch_order' CROP_LAYER = 'crop' SUB_NESTED_SEQ = 'sub_nested_seq' CLIP_LAYER = 'clip' @@ -352,6 +354,10 @@ class LayerOutput(object): def height(self): return cp.g_layer_map[self.full_name].height + @property + def depth(self): + return cp.g_layer_map[self.full_name].depth + def set_input(self, input): """ Set the input for a memory layer. Can only be used for memory layer @@ -941,7 +947,7 @@ def data_layer(name, size, depth=None, height=None, width=None, if height is not None and width is not None: num_filters = size / (width * height * depth) assert num_filters * width * height * depth == size, \ - "size=%s width=%s height=%s depth=%s" % (size, width, height, depth) + "size=%s width=%s height=%s depth=%s" % (size, width, height, depth) return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters) @@ -2951,13 +2957,15 @@ def img_cmrnorm_layer(input, def batch_norm_layer(input, act=None, name=None, + img3D=False, num_channels=None, bias_attr=None, param_attr=None, layer_attr=None, batch_norm_type=None, moving_average_fraction=0.9, - use_global_stats=None): + use_global_stats=None, + mean_var_names=None): """ Batch Normalization Layer. The notation of this layer as follow. @@ -3024,6 +3032,8 @@ def batch_norm_layer(input, :math:`runningMean = newMean*(1-factor) + runningMean*factor` :type moving_average_fraction: float. + :param mean_var_names: [mean name, variance name] + :type mean_var_names: string list :return: LayerOutput object. :rtype: LayerOutput """ @@ -3037,6 +3047,7 @@ def batch_norm_layer(input, (batch_norm_type == "cudnn_batch_norm") l = Layer( name=name, + img3D=img3D, inputs=Input( input.name, image=Image(channels=num_channels), **param_attr.attr), active_type=act.name, @@ -3045,6 +3056,7 @@ def batch_norm_layer(input, bias=ParamAttr.to_bias(bias_attr), moving_average_fraction=moving_average_fraction, use_global_stats=use_global_stats, + mean_var_names=mean_var_names, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( @@ -6404,6 +6416,54 @@ def gated_unit_layer(input, layer_attr=layer_attr) +@layer_support() +@wrap_name_default('switch_order') +def switch_order_layer(input, + name=None, + reshape_axis=None, + act=None, + layer_attr=None): + """ + This layer switch dimension order of image input. + From order "batchSize, channels, height, width" + to order "batchSize, height, width, channels". + + The example usage is: + + .. code-block:: python + reshape_axis = 3 + switch = switch_order(input=layer, name='switch', reshape_axis=reshape_axis) + reshape = {'height':[ 0, 1, 2], 'width':[3]} + + :param input: The input layer. + :type input: LayerOutput + :param name: Name of this layer. + :type name: basestring + :param reshape: reshape matrix by axises. + :type reshape: Dict + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input, LayerOutput) + assert reshape_axis != None and (reshape_axis > 0 and reshape_axis < 4) + height = [ele for ele in xrange(reshape_axis)] + width = [ele for ele in range(reshape_axis, 4)] + reshape = {'height': height, 'width': width} + + l = Layer( + name=name, + inputs=input.name, + reshape=reshape, + type=LayerType.SWITCH_ORDER_LAYER, + active_type=act.name, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.SWITCH_ORDER_LAYER, + parents=input, + size=l.config.size) + + @wrap_name_default() @layer_support() def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index df872a90ff388f0d96cef44763dbd076bc768ab9..8a204a96f3ef57673cef65306d0bf8e8c3409751 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -10,6 +10,6 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer -test_conv3d_layer test_deconv3d_layer) +test_conv3d_layer test_deconv3d_layer test_BatchNorm3D) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr index 1a577b8d9b1e1915236ba6afcfa97040d70c707a..5ddf6052df021b055390a42c25ce6c0d650e4aee 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr @@ -62,6 +62,7 @@ layers { moving_average_fraction: 0.9 height: 227 width: 227 + depth: 1 } layers { name: "__crmnorm_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr index 2818389b16cca75f5030b75fc4de8c89c06c5e02..c0252b945b4c7fd6b4dad8770e3e1dccb88df28a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr @@ -62,6 +62,7 @@ layers { moving_average_fraction: 0.9 height: 256 width: 256 + depth: 1 } layers { name: "__crmnorm_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr new file mode 100644 index 0000000000000000000000000000000000000000..832ed24a31dd2bedba9a4fce77d7a088d1796fdb --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr @@ -0,0 +1,92 @@ +type: "nn" +layers { + name: "data3D" + type: "data" + size: 360 + active_type: "" + height: 6 + width: 20 + depth: 3 +} +layers { + name: "__batch_norm_0__" + type: "batch_norm" + size: 360 + active_type: "relu" + inputs { + input_layer_name: "data3D" + input_parameter_name: "___batch_norm_0__.w0" + image_conf { + channels: 1 + img_size: 20 + img_size_y: 6 + img_size_z: 3 + } + } + inputs { + input_layer_name: "data3D" + input_parameter_name: "___batch_norm_0__.w1" + } + inputs { + input_layer_name: "data3D" + input_parameter_name: "___batch_norm_0__.w2" + } + bias_parameter_name: "___batch_norm_0__.wbias" + moving_average_fraction: 0.9 + height: 6 + width: 20 + depth: 3 +} +parameters { + name: "___batch_norm_0__.w0" + size: 1 + initial_mean: 1.0 + initial_std: 0.0 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___batch_norm_0__.w1" + size: 1 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false + is_static: true + is_shared: true +} +parameters { + name: "___batch_norm_0__.w2" + size: 1 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false + is_static: true + is_shared: true +} +parameters { + name: "___batch_norm_0__.wbias" + size: 1 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data3D" +output_layer_names: "__batch_norm_0__" +sub_models { + name: "root" + layer_names: "data3D" + layer_names: "__batch_norm_0__" + input_layer_names: "data3D" + output_layer_names: "__batch_norm_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr index b110e91498ce7d112987714bd769868179141c54..8a1399efad0ff339e35f69400ac654a4787a6018 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr @@ -74,6 +74,9 @@ layers { inputs { input_layer_name: "__bidirectional_gru_0___bw" } + height: 0 + width: 0 + depth: 1 } parameters { name: "___bidirectional_gru_0___fw_transform.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr index 8133aa9c8d3e7c6843d1b27b70e87d394a1e0e47..046037936a6d85f54095c65f206e468aa69065d7 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr @@ -16,6 +16,9 @@ layers { inputs { input_layer_name: "data" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_1__" @@ -28,6 +31,9 @@ layers { inputs { input_layer_name: "__addto_0__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_2__" @@ -40,6 +46,9 @@ layers { inputs { input_layer_name: "__addto_1__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_3__" @@ -52,6 +61,9 @@ layers { inputs { input_layer_name: "__addto_2__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_4__" @@ -64,6 +76,9 @@ layers { inputs { input_layer_name: "__addto_3__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_5__" @@ -76,6 +91,9 @@ layers { inputs { input_layer_name: "__addto_4__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_6__" @@ -88,6 +106,9 @@ layers { inputs { input_layer_name: "__addto_5__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_7__" @@ -100,6 +121,9 @@ layers { inputs { input_layer_name: "__addto_6__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_8__" @@ -112,6 +136,9 @@ layers { inputs { input_layer_name: "__addto_7__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_9__" @@ -124,6 +151,9 @@ layers { inputs { input_layer_name: "__addto_8__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_10__" @@ -136,6 +166,9 @@ layers { inputs { input_layer_name: "__addto_9__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_11__" @@ -148,6 +181,9 @@ layers { inputs { input_layer_name: "__addto_10__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_12__" @@ -160,6 +196,9 @@ layers { inputs { input_layer_name: "__addto_11__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_13__" @@ -172,6 +211,9 @@ layers { inputs { input_layer_name: "__addto_12__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_14__" @@ -184,6 +226,9 @@ layers { inputs { input_layer_name: "__addto_13__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_15__" @@ -196,6 +241,9 @@ layers { inputs { input_layer_name: "__addto_14__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_16__" @@ -208,6 +256,9 @@ layers { inputs { input_layer_name: "__addto_15__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_17__" @@ -220,6 +271,9 @@ layers { inputs { input_layer_name: "__addto_16__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_18__" @@ -232,6 +286,9 @@ layers { inputs { input_layer_name: "__addto_17__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_19__" @@ -244,6 +301,9 @@ layers { inputs { input_layer_name: "__addto_18__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_20__" @@ -256,6 +316,9 @@ layers { inputs { input_layer_name: "__addto_19__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_21__" @@ -268,6 +331,9 @@ layers { inputs { input_layer_name: "__addto_20__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_22__" @@ -280,6 +346,9 @@ layers { inputs { input_layer_name: "__addto_21__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_23__" @@ -292,6 +361,9 @@ layers { inputs { input_layer_name: "__addto_22__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_24__" @@ -304,6 +376,9 @@ layers { inputs { input_layer_name: "__addto_23__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_25__" @@ -316,6 +391,9 @@ layers { inputs { input_layer_name: "__addto_24__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_26__" @@ -328,6 +406,9 @@ layers { inputs { input_layer_name: "__addto_25__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_27__" @@ -340,6 +421,9 @@ layers { inputs { input_layer_name: "__addto_26__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_28__" @@ -352,6 +436,9 @@ layers { inputs { input_layer_name: "__addto_27__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_29__" @@ -364,6 +451,9 @@ layers { inputs { input_layer_name: "__addto_28__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_30__" @@ -376,6 +466,9 @@ layers { inputs { input_layer_name: "__addto_29__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__addto_31__" @@ -388,6 +481,9 @@ layers { inputs { input_layer_name: "__addto_30__" } + height: 0 + width: 0 + depth: 1 } layers { name: "__fc_layer_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr index d0ad388165007b8f96f059e5b003c52f756383e5..7a2f3eab38808a031c27cf7ab9d6273952e389eb 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr @@ -22,6 +22,9 @@ layers { inputs { input_layer_name: "b" } + height: 0 + width: 0 + depth: 1 } layers { name: "__concat_0__" @@ -34,6 +37,9 @@ layers { inputs { input_layer_name: "b" } + height: 0 + width: 0 + depth: 1 } layers { name: "__concat_1__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py new file mode 100644 index 0000000000000000000000000000000000000000..a991b22252ba10eed895efd931108c2d8b0e52f1 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py @@ -0,0 +1,11 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-4) + +#data = data_layer(name='data', size=180, width=30, height=6) +#batchNorm = batch_norm_layer(data, num_channels=1) +#outputs(batchNorm) + +data3D = data_layer(name='data3D', size=120 * 3, width=20, height=6, depth=3) +batchNorm3D = batch_norm_layer(data3D, num_channels=1, img3D=True) +outputs(batchNorm3D) diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py index 0349407a851ebb48f69d7daef7a318cf348aad5d..c1585bcffcceb75292853018179066c9f614261e 100644 --- a/python/paddle/v2/framework/op.py +++ b/python/paddle/v2/framework/op.py @@ -4,8 +4,8 @@ import paddle.v2.framework.proto.framework_pb2 as framework_pb2 def get_all_op_protos(): """ - Get all registered op proto from Paddle C++ - :return: list of OpProto + Get all registered op proto from PaddlePaddle C++ end. + :return: A list of registered OpProto. """ protostrs = core.get_all_op_protos() ret_values = [] @@ -21,8 +21,8 @@ def is_str(s): class OpDescCreationMethod(object): """ - A Functor object to convert user input(use key word args) to OpDesc based on - OpProto. + Convert the user's input(only keyword arguments are supported) to OpDesc + based on the OpProto. :param op_proto: The OpProto object. :type op_proto: op_proto_pb2.OpProto @@ -30,17 +30,18 @@ class OpDescCreationMethod(object): def __init__(self, op_proto): if not isinstance(op_proto, framework_pb2.OpProto): - raise TypeError("Argument should be OpProto") + raise TypeError( + "Type of op_proto should be OpProto in PaddlePaddle.") self.__op_proto__ = op_proto def __call__(self, *args, **kwargs): """ - Convert user input to OpDesc. Only key-word args are supported. - :return: OpDesc based on user input + Convert user's input to OpDesc. Only keyword arguments are supported. + :return: The OpDesc based on user input. :rtype: op_desc_pb2.OpDesc """ if len(args) != 0: - raise ValueError("Only keyword arguments is supported by Paddle") + raise ValueError("Only keyword arguments are supported.") op_desc = framework_pb2.OpDesc() for input_parameter in self.__op_proto__.inputs: @@ -49,8 +50,9 @@ class OpDescCreationMethod(object): input_arguments = [input_arguments] if not input_parameter.duplicable and len(input_arguments) > 1: - raise ValueError("Input %s only accepts one input, but give %d" - % (input_parameter.name, len(input_arguments))) + raise ValueError( + "Input %s expects only one input, but %d are given." % + (input_parameter.name, len(input_arguments))) ipt = op_desc.inputs.add() ipt.parameter = input_parameter.name @@ -63,7 +65,7 @@ class OpDescCreationMethod(object): if not output_parameter.duplicable and len(output_arguments) > 1: raise ValueError( - "Output %s only accepts one output, but give %d" % + "Output %s expects only one output, but %d are given." % (output_parameter.name, len(output_arguments))) out = op_desc.outputs.add() @@ -100,15 +102,17 @@ class OpDescCreationMethod(object): pair.first = p[0] pair.second = p[1] else: - raise NotImplementedError("Not support attribute type " + - str(attr.type)) + raise NotImplementedError( + "A not supported attribute type: %s." % ( + str(attr.type))) return op_desc @staticmethod def any_is_true(generator): """ - Reduce a bool array to one. If any of them is True, then return True. + Reduce a boolean array to a single boolean parameter. If any element in + the array is True, this function will return True, otherwise False. """ for flag in generator: if flag: @@ -127,7 +131,7 @@ class OpInfo(object): def create_op_creation_method(op_proto): """ - Generate op creation method for an OpProto + Generate op creation method for an OpProto. """ method = OpDescCreationMethod(op_proto) @@ -146,20 +150,23 @@ def create_op_creation_method(op_proto): class OperatorFactory(object): def __init__(self): self.op_methods = dict() + for op_proto in get_all_op_protos(): method = create_op_creation_method(op_proto) self.op_methods[method.name] = method def __call__(self, *args, **kwargs): - if 'type' in kwargs: + if "type" in kwargs: if len(args) != 0: - raise ValueError("All Paddle argument should be key-word " - "argument except type") - t = kwargs.pop('type') + raise ValueError( + "Except the argument \"type\"," + "all of the other arguments should be keyword arguments.") + t = kwargs.pop("type") else: if len(args) != 1: - raise ValueError("All Paddle argument should be key-word " - "argument except type") + raise ValueError( + "Except the argument \"type\"," + "all of the other arguments should be keyword arguments.") t = args[0] return self.get_op_info(t).method(**kwargs) @@ -169,7 +176,7 @@ class OperatorFactory(object): def get_op_info(self, t): if t not in self.op_methods: - raise ValueError("operator %s is not registered", t) + raise ValueError("The operator: %s is not registered." % t) return self.op_methods.get(t) def get_op_input_names(self, type): @@ -184,7 +191,7 @@ class OperatorFactory(object): class __RecurrentOp__(object): __proto__ = None - type = 'recurrent' + type = "recurrent" def __init__(self): # cache recurrent_op's proto @@ -194,8 +201,8 @@ class __RecurrentOp__(object): self.__proto__ = op_proto def __call__(self, *args, **kwargs): - if self.type not in args and 'type' not in kwargs: - kwargs['type'] = self.type + if self.type not in args and "type" not in kwargs: + kwargs["type"] = self.type # create proto create_method = OpDescCreationMethod(self.__proto__) proto = create_method(*args, **kwargs) @@ -203,5 +210,5 @@ class __RecurrentOp__(object): return core.RecurrentOp.create(proto.SerializeToString()) -Operator = OperatorFactory() # Default global factory +Operator = OperatorFactory() # The default global factory RecurrentOp = __RecurrentOp__() diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 9d41b84e57447a549d5406ec17a3c7bba24057fb..96a70853297eb6e4d7525a85867dfe07c20ac80a 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -17,6 +17,7 @@ py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py) py_test(test_gather_op SRCS test_gather_op.py) py_test(test_scatter_op SRCS test_scatter_op.py) py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py) +py_test(test_top_k_op SRCS test_top_k_op.py) py_test(gradient_checker SRCS gradient_checker.py) diff --git a/python/paddle/v2/framework/tests/mnist.py b/python/paddle/v2/framework/tests/mnist.py index a68f302f9c344bf6d63e8d9b48836d69338c3d0b..f6f8f49b797fb6e5016a5e309f12f192d5096431 100644 --- a/python/paddle/v2/framework/tests/mnist.py +++ b/python/paddle/v2/framework/tests/mnist.py @@ -38,9 +38,9 @@ def feed_data(name, data): assert isinstance(data, numpy.ndarray) tensor = scope.find_var(name).get_tensor() tensor.set_dims(data.shape) - if data.dtype == numpy.dtype('int32'): + if data.dtype == numpy.dtype("int32"): tensor.alloc_int(place) - elif data.dtype == numpy.dtype('float32'): + elif data.dtype == numpy.dtype("float32"): tensor.alloc_float(place) else: raise ValueError("data type not supported") @@ -74,22 +74,25 @@ def init_param(net, param_name, dims): # fc_layer def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None): """ - Add a fc layer to net + The fully connected layer. - :param input: input variable name. + :param input: The name of input variable. :type input: str - :param size: fully connected layer size. - :param act: activation name - :param param: parameter attribute, used for initialize parameters. - :param bias: bias attribute. False will not have a bias. - :param name: the name of fc layer. If not set, model will generate a - readable name - :return: output variable name. + :param size: The size of fully connected layer. + :param act: The name of activation. + :param param: The attribute of learnable parameter which can be used to + modify initialization mean and std of the parameter. + :param bias: The attribute of bias. If set False, this layer does not have + a bias. + :param name: The name of this layer. If it is not set explictly, a name + will be generated automatically. + :return: The name of the output variable. """ + if name is None: - name = 'fc_%d' % uniq_id() + name = "fc_%d" % uniq_id() if not isinstance(name, str): - raise ValueError("name should be string") + raise ValueError("The name of a layer should be a string.") input_dims = scope.find_var(input).get_tensor().get_dims() @@ -123,7 +126,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None): def cross_entropy_layer(net, input, label): - cost_name = 'cross_entropy_%d' % uniq_id() + cost_name = "cross_entropy_%d" % uniq_id() cross_entropy_op = Operator( "onehot_cross_entropy", X=input, label=label, Y=cost_name) net.append_op(cross_entropy_op) @@ -177,8 +180,8 @@ def error_rate(predict, label): return error_num / float(len(label)) -images = data_layer(name='pixel', dims=[BATCH_SIZE, 784]) -labels = data_layer(name='label', dims=[BATCH_SIZE]) +images = data_layer(name="pixel", dims=[BATCH_SIZE, 784]) +labels = data_layer(name="label", dims=[BATCH_SIZE]) fc1 = fc_layer(net=forward_net, input=images, size=100, act="sigmoid") fc2 = fc_layer(net=forward_net, input=fc1, size=100, act="sigmoid") predict = fc_layer(net=forward_net, input=fc2, size=10, act="softmax") diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py index 857427cdfbb4374957e249f0faa4cfc46ac0e8c7..e8a7f848dffa0529c8cb0d6599286ce0e228d180 100644 --- a/python/paddle/v2/framework/tests/test_gradient_checker.py +++ b/python/paddle/v2/framework/tests/test_gradient_checker.py @@ -7,11 +7,11 @@ from gradient_checker import get_numeric_gradient class GetNumericGradientTest(unittest.TestCase): def test_add_op(self): - add_op = Operator('add', X="X", Y="Y", Out="Z") + add_op = Operator("add", X="X", Y="Y", Out="Z") x = numpy.random.random((10, 1)).astype("float32") y = numpy.random.random((10, 1)).astype("float32") - arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X') + arr = get_numeric_gradient(add_op, {"X": x, "Y": y}, "Z", "X") self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-4) def test_softmax_op(self): @@ -35,9 +35,9 @@ class GetNumericGradientTest(unittest.TestCase): dY = numpy.ones(Y.shape) dX = label_softmax_grad(Y, dY) - arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X') + arr = get_numeric_gradient(softmax_op, {"X": X}, "Y", "X") numpy.testing.assert_almost_equal(arr, dX, decimal=1e-2) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_lookup_table.py b/python/paddle/v2/framework/tests/test_lookup_table.py index 19eb464baa555fb67a994f3cfb4d3ed628367c73..4b7ce92c0f0492a73c158378299933a0b329948b 100644 --- a/python/paddle/v2/framework/tests/test_lookup_table.py +++ b/python/paddle/v2/framework/tests/test_lookup_table.py @@ -4,7 +4,7 @@ from op_test_util import OpTestMeta from gradient_checker import GradientChecker, create_op -class TestSigmoidOp(unittest.TestCase): +class TestLookupTableOp(unittest.TestCase): __metaclass__ = OpTestMeta def setUp(self): @@ -15,7 +15,7 @@ class TestSigmoidOp(unittest.TestCase): self.outputs = {'Out': table[ids]} -class TestSigmoidGradOp(GradientChecker): +class TestLookupTableGradOp(GradientChecker): def test_grad(self): op = create_op('lookup_table') table = np.random.random((17, 31)).astype('float32') diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py index b58e4266d1588a4b6151f5f896537ded6ddd3896..8c827e242e866b267e0fc4b73c31bafa0ccc7c48 100644 --- a/python/paddle/v2/framework/tests/test_mul_op.py +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -2,6 +2,7 @@ import unittest import numpy as np from gradient_checker import GradientChecker, create_op from op_test_util import OpTestMeta +from paddle.v2.framework.op import Operator class TestMulOp(unittest.TestCase): @@ -16,6 +17,22 @@ class TestMulOp(unittest.TestCase): self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} +class TestMulOp2(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "mul" + self.inputs = { + 'X': np.random.random((15, 4, 12, 10)).astype("float32"), + 'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32") + } + self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2} + self.outputs = { + 'Out': np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10), + self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9)) + } + + class TestMulGradOp(GradientChecker): def setUp(self): self.op = create_op("mul") @@ -49,7 +66,38 @@ class TestMulGradOp(GradientChecker): no_grad_set={"Y"}) -# TODO(dzh,qijun) : mulgrad test case need transpose feature of blas library +class TestMulGradTest2(GradientChecker): + def setUp(self): + self.op = Operator( + "mul", X="X", Y="Y", Out="Out", x_num_col_dims=2, y_num_col_dims=2) + self.inputs = { + "X": np.random.random((15, 4, 12, 10)).astype("float32"), + "Y": np.random.random((4, 30, 8, 2, 9)).astype("float32") + } + + def test_cpu_gpu_compare(self): + self.compare_grad(self.op, self.inputs) + + def test_normal(self): + self.check_grad( + self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5) + + def test_ignore_x(self): + self.check_grad( + self.op, + self.inputs, ["Y"], + "Out", + max_relative_error=0.5, + no_grad_set={"X"}) + + def test_ignore_y(self): + self.check_grad( + self.op, + self.inputs, ["X"], + "Out", + max_relative_error=0.5, + no_grad_set={"Y"}) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py index 2ddb85e2e7a98a08bd1d6e24e6f812f6021142e8..8378c1cd21c21fd31da9b82d2cdaaff332f291d7 100644 --- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py +++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py @@ -16,6 +16,18 @@ class TestRowwiseAddOp(unittest.TestCase): self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])} +class TestRowwiseAddOp2(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "rowwise_add" + self.inputs = { + 'X': np.random.random((13, 6, 7, 8)).astype("float32"), + 'b': np.random.random((7, 8)).astype("float32") + } + self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])} + + class TestRowwiseAddGradOp(GradientChecker): def setUp(self): self.op = create_op("rowwise_add") @@ -34,5 +46,23 @@ class TestRowwiseAddGradOp(GradientChecker): self.check_grad(self.op, self.inputs, ["b"], "Out", no_grad_set={"X"}) +class TestRowwiseAddGradOp2(GradientChecker): + def setUp(self): + self.op = create_op("rowwise_add") + self.inputs = { + "X": np.random.uniform(0.1, 1, [2, 3, 2, 5]).astype("float32"), + "b": np.random.uniform(0.1, 1, [2, 5]).astype("float32") + } + + def test_normal(self): + self.check_grad(self.op, self.inputs, ["X", "b"], "Out") + + def test_ignore_b(self): + self.check_grad(self.op, self.inputs, ["X"], "Out", no_grad_set={"b"}) + + def test_ignore_x(self): + self.check_grad(self.op, self.inputs, ["b"], "Out", no_grad_set={"X"}) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py index e670d93653e07d35e5019c9daac45c214eddf367..0d590fa7065bdd2df0e3f2aea5464f0524d70670 100644 --- a/python/paddle/v2/framework/tests/test_softmax_op.py +++ b/python/paddle/v2/framework/tests/test_softmax_op.py @@ -18,18 +18,22 @@ class TestSoftmaxOp(unittest.TestCase): def setUp(self): self.type = "softmax" - self.inputs = {'X': np.random.random((32, 100)).astype("float32")} + self.inputs = {"X": np.random.random((10, 10)).astype("float32")} self.outputs = { - 'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) + "Y": np.apply_along_axis(stable_softmax, 1, self.inputs["X"]) } -class SoftmaxGradOpTest(GradientChecker): - def test_softmax(self): - op = create_op("softmax") - inputs = {"X": np.random.uniform(0.1, 1, [10, 10]).astype("float32")} - self.check_grad(op, inputs, set("X"), "Y") +class TestSoftmaxGradOp(GradientChecker): + def setUp(self): + self.op = create_op("softmax") + self.inputs = { + "X": np.random.uniform(0.1, 1, [10, 10]).astype("float32") + } + + def test_softmax_grad(self): + self.check_grad(self.op, self.inputs, ["X"], "Y") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_top_k_op.py b/python/paddle/v2/framework/tests/test_top_k_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e841d96d26bba13b2780c41ea7a209fd470cad3b --- /dev/null +++ b/python/paddle/v2/framework/tests/test_top_k_op.py @@ -0,0 +1,52 @@ +import unittest +import numpy as np +from gradient_checker import GradientChecker, create_op +from op_test_util import OpTestMeta + + +class TestTopkOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "top_k" + k = 1 + input = np.random.random((32, 84)).astype("float32") + output = np.ndarray((32, k)) + indices = np.ndarray((32, k)) + + self.inputs = {'X': input} + self.attrs = {'k': k} + + for rowid in xrange(32): + row = input[rowid] + output[rowid] = np.sort(row)[-k:] + indices[rowid] = row.argsort()[-k:] + + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp3d(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "top_k" + k = 1 + input = np.random.random((32, 2, 84)).astype("float32") + input_flat_2d = input.reshape(64, 84) + output = np.ndarray((64, k)) + indices = np.ndarray((64, k)).astype("int") + + # FIXME: should use 'X': input for a 3d input + self.inputs = {'X': input_flat_2d} + self.attrs = {'k': k} + + for rowid in xrange(64): + row = input_flat_2d[rowid] + output[rowid] = np.sort(row)[-k:] + indices[rowid] = row.argsort()[-k:] + + self.outputs = {'Out': output, 'Indices': indices} + + +if __name__ == '__main__': + unittest.main()