提交 06d24e3a 编写于 作者: T tensor-tang

Merge remote-tracking branch 'upstream/develop' into inference

...@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true ...@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false BinPackParameters: false
BinPackArguments: false BinPackArguments: false
... ...
...@@ -42,7 +42,7 @@ before_install: ...@@ -42,7 +42,7 @@ before_install:
script: script:
- | - |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
- | - |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
......
...@@ -133,6 +133,8 @@ include(external/any) # download libn::any ...@@ -133,6 +133,8 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
include(external/nccl) include(external/nccl)
include(external/cares)
include(external/grpc)
include(cudnn) # set cudnn libraries, must before configure include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration include(configure) # add paddle env configuration
......
...@@ -29,7 +29,7 @@ RUN apt-get update && \ ...@@ -29,7 +29,7 @@ RUN apt-get update && \
automake locales clang-format swig doxygen cmake \ automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \ net-tools libtool && \
apt-get clean -y apt-get clean -y
# Install Go and glide # Install Go and glide
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: c-ares is needed when linking with grpc.
SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
ExternalProject_Add(
extern_cares
GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
GIT_TAG "cares-1_13_0"
PREFIX ${CARES_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
BUILD_IN_SOURCE 1
BUILD_COMMAND make
INSTALL_COMMAND make install
)
ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
"${CARES_INSTALL_DIR}/lib/libcares.a")
include_directories(${CARES_INCLUDE_DIR})
ADD_DEPENDENCIES(cares extern_cares)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
IF(APPLE)
SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
ELSE()
SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
ENDIF()
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
GIT_TAG "v1.7.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
# NOTE(yuyang18):
# Disable -Werror, otherwise the compile will fail in MacOS.
# It seems that we cannot configure that by make command.
# Just dry run make command and remove `-Werror`, then use a shell to run make commands
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
)
# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgpr.a")
ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
include_directories(${GRPC_INCLUDE_DIR})
ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
...@@ -15,7 +15,18 @@ ...@@ -15,7 +15,18 @@
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
FIND_PACKAGE(Protobuf QUIET) FIND_PACKAGE(Protobuf QUIET)
SET(PROTOBUF_FOUND "OFF") macro(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME} CACHE)
UNSET(${VAR_NAME})
endmacro()
UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(PROTOBUF_FOUND)
UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
UNSET_VAR(PROTOBUF_LITE_LIBRARY)
UNSET_VAR(PROTOBUF_LIBRARY)
UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined.
function(protobuf_generate_python SRCS) function(protobuf_generate_python SRCS)
...@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB) ...@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
# FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
# make `protobuf_generate_cpp` happy. # make `protobuf_generate_cpp` happy.
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
FOREACH(dep ${protobuf_DEPS}) FOREACH(dep ${protobuf_DEPS})
ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf ${dep})
ADD_DEPENDENCIES(protobuf_lite ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep})
...@@ -128,11 +138,11 @@ endmacro() ...@@ -128,11 +138,11 @@ endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
if (NOT "${PROTOBUF_ROOT}" STREQUAL "") if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include) find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib) find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib) find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib) find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin) find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
SET_PROTOBUF_VERSION() SET_PROTOBUF_VERSION()
......
...@@ -50,6 +50,8 @@ ExternalProject_Add( ...@@ -50,6 +50,8 @@ ExternalProject_Add(
) )
LIST(APPEND external_project_dependencies zlib) LIST(APPEND external_project_dependencies zlib)
ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API) IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
......
...@@ -459,11 +459,58 @@ function(py_test TARGET_NAME) ...@@ -459,11 +459,58 @@ function(py_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options STATIC static SHARED shared) set(options STATIC static SHARED shared)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
${PYTHON_EXECUTABLE} ${py_test_SRCS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endfunction() endfunction()
# grpc_library generate grpc code using grpc_cpp_plugin and protoc
# then build the generated protobuf code and grpc code with your
# implementation source codes together. Use SRCS argument for your
# implementation source files and PROTO argument for your .proto
# files.
#
# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
function(grpc_library TARGET_NAME)
set(oneValueArgs PROTO)
set(multiValueArgs SRCS DEPS)
set(options "")
cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
message(STATUS "generating grpc ${grpc_library_PROTO}")
get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
add_custom_command(
OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
--plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
# FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
# as compiler warnings instead of error. Should try remove the warnings also.
set_source_files_properties(
${grpc_grpc_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
set_source_files_properties(
${grpc_library_SRCS}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction()
...@@ -415,6 +415,13 @@ multiplex ...@@ -415,6 +415,13 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex .. autoclass:: paddle.v2.layer.multiplex
:noindex: :noindex:
Factorization Machine Layer
============================
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Slicing and Joining Layers Slicing and Joining Layers
========================== ==========================
......
...@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co ...@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co
- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
### CUDA version issue
There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0.
CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
```
typedef struct __align__(2) {
unsigned short x;
} __half;
typedef __half half;
```
This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
```
__global__ void Add() {
half a, b, c;
c = __hadd(a, b); // correct
c = a + b; // compiler error: no operator "+" matches these operands
}
```
CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
```
typedef struct __CUDA_ALIGN__(2) {
unsigned short x;
} __half_raw;
struct __CUDA_ALIGN__(2) __half {
protected:
unsigned short __x;
public:
// constructors and conversion operators from/to
// __half_raw and other built-in data types
}
typedef __half half;
__device__ __forceinline__
__half operator+(const __half &lh, const __half &rh) {
return __hadd(lh, rh);
}
// Other overloaded operators
```
This new design makes `c = a + b` work correctly for CUDA half data type.
## Implementation ## Implementation
The float16 class holds a 16-bit `uint16_t` data internally. The float16 class holds a 16-bit `uint16_t` data internally.
......
...@@ -2,106 +2,70 @@ ...@@ -2,106 +2,70 @@
## Abstract ## Abstract
PaddlePaddle v0.10.0 uses the "trainer-parameter server" PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations:
architecture. We run multiple replicated instances of trainers (runs
the same code written by the user) and parameter servers for
distributed training. This architecture served us well, but has some
limitations:
1. Need to write special code to handle tasks which should only be run 1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
by a single trainer. E.g., initializing model and saving model.
2. Model parallelism is hard: need to write if-else branches conditioned 2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
on the trainer ID to partition model onto each trainer, and manually
write the inter-model-shard communication code.
3. The user can not directly specify the parameter update rule: need 3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
to modify the parameter server C++ code and compile a new
binary. This adds complication for researchers: A lot of extra
effort is required. Besides, the training job submission program
may not allow running arbitrary binaries.
This design doc discusses PaddlePaddle's new distributed training This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
architecture that addresses the above limitations.
## Analysis ## Analysis
We will assume the user writes the trainer program by Python, the same The assumption is that the user writes the trainer program in either Python or C++.
analysis holds if the trainer program is written in C++.
### Limitation 1 ### Limitation 1
If we look at the Python code that the user writes, there are two There are two basic functionalities in the trainer program:
kinds of functionalities:
- The training logic such as load / save model and print log. 1. The training logic such as loading / saving the model and printing out the logs.
- The neural network definition such as the definition of the data 2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
layer, the fully connected layer, the cost function and the
optimizer. optimizer.
When we training with PaddlePaddle v0.10.0 distributedly, multiple When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the
replicated Python instances are running on different nodes: both the training logic as well as the neural network computation logic, is replicated.
training logic and the neural network computation is replicated.
The tasks that should only run once all belong to the training logic, The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
if we only replicate the neural network computation, but do **not** replicate the training logic, the limitation mentioned above can be avoided.
replicate the training logic, the limitation could be solved.
### Limitation 2 ### Limitation 2
Model parallelism means running a single model on multiple nodes by Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the
partitioning the model onto different nodes and managing the inter-model-shard communication between nodes.
inter-model-shard communications.
PaddlePaddle should be able to modify the nerual network computation PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
definition to support model parallelism automatically. However, the computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
computation is only specified in Python code, and PaddlePaddle can not
modify Python code.
Just like compiler uses a intermediate representation (IR) so that Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
programmer does not need to manually optimize their code in most of
the cases - the compiler will optimize the IR:
<img src="src/compiler.png"/> <img src="src/compiler.png"/>
We can have our own IR too: PaddlePaddle can support model parallel by PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
converting the IR so the user no longer need to manually do it in
Python:
<img src="src/paddle-compile.png"/> <img src="src/paddle-compile.png"/>
The IR for PaddlePaddle after refactor is called `Block`, it specifies The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
the computation dependency graph and the variables used in the
computation.
### Limitation 3 ### Limitation 3
The user can not directly specify the parameter update rule for the The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
parameter server because the parameter server does not use the same
computation definition as the trainer. Instead, the update rule is
baked in the parameter server. The user can not specify the update
rule in the same way of specifying the trainer computation.
This could be fixed by making the parameter server run the same This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
computation definition as the trainer. For a detailed explanation,
please
see
[Design Doc: Operation Graph Based Parameter Server](./dist_train.md) [Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
## Distributed Training Architecture ## Distributed Training Architecture
The new distributed training architecture can address the above The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
limitations. Below is the illustration:
<img src="src/distributed_architecture.png"/> <img src="src/distributed_architecture.png"/>
The architecture includes major components: *PaddlePaddle Python*, The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
*PaddlePaddle converter* and *PaddlePaddle runtime*:
### PaddlePaddle Python ### PaddlePaddle Python
PaddlePaddle Python is the Python library that user's Python trainer PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
invoke to build the neural network topology, start training, etc.
```Python ```Python
paddle.init() paddle.init()
...@@ -117,102 +81,60 @@ for i in range(1000): ...@@ -117,102 +81,60 @@ for i in range(1000):
print cost_val print cost_val
``` ```
The code above is a typical Python trainer code, the neural network The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
topology is built using helper functions such as
`paddle.layer.fc`. The training is done by calling `session.eval`
iteratively.
#### session.eval #### session.eval
As shown in the graph, `session.eval` sends the IR and the evaluation As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
inputs/targets to the PaddlePaddle cluster for evaluation. The The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
targets can be any variable in the computation graph. When the target
is the `optimizer` variable, the neural network will be optimized
once. When the target is the `cost` variable, `session.eval` returns
the cost value.
The Python `session` is a wrapper of the C++ `Session` class. For more The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
information about `Session`, please
see [Design Doc: Session](./session.md).
### PaddlePaddle Converter ### PaddlePaddle Converter
PaddlePaddle converter automatically converts the IR in the request The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
(IR and evaluation inputs/targets) from PaddlePaddle Python to new
partitioned IRs and dispatch the new IRs and evaluation inputs/targets
to different PaddlePaddle runtimes. Below are the steps:
1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that 1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
fetches the eval targets to the IR.
1. Extract a new computation (sub)graph with `feed` and `fetch` OP as 2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
the boundary. The runtime does not need to run the OP that is not
dependent by the `fetch` OP.
1. Optimizes the computation graph. 3. Optimize the computation graph.
1. Place the OPs in the graph onto different devices on different 4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
PaddlePaddle runtime according to a placement algorithm and device
constraint specified by the user.
1. Partition the graph according to runtime boundaries and add `send` / 5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
`recv` OP pair on the runtime boundaries.
1. Dispatch the partitioned graph to different PaddlePaddle runtimes. 6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
1. PaddlePaddle runtimes with the `fetch` OP reports evaluation 7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
results back to the converter, the convert reports the evaluation
results back to the PaddlePaddle Python.
The output IRs will be cached to optimize the conversion latency. The output IRs will be cached to optimize the conversion latency.
#### Placement Algorithm #### Placement Algorithm
Our first implementation will only support "trainer-parameter server" Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
placement: the parameters, initializers, and optimizers are placed on
the PaddlePaddle runtimes with the parameter server role. And
everything else will be placed on the PaddlePaddle runtimes with the
trainer role. This has the same functionality of our
"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
is more general and flexible.
In the future, we will implement the general placement algorithm, In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
which makes placements according to the input IR, and a model of
device computation time and device communication time. Model
parallelism requires the general placement algorithm.
### PaddlePaddle Runtime ### PaddlePaddle Runtime
The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
runs the IR. The runtime does not need to do OP placement since it's
already done by the converter.
### Local Training Architecture ### Local Training Architecture
The local training architecture will be the same as the distributed The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
training architecture, the differences are everything runs locally,
and there is just one PaddlePaddle runtime:
<img src="src/local_architecture.png"/> <img src="src/local_architecture.png"/>
### Training Data ### Training Data
In PaddlePaddle v0.10.0, training data is typically read In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
with [data reader](../reader/README.md) from Python. This approach is
no longer efficient when training distributedly since the Python When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
process no longer runs on the same node with the trainer processes,
the Python reader will need to read from the distributed filesystem
(assuming it has the access) and send to the trainers, doubling the
network traffic.
When doing distributed training, the user can still use Python data
reader: the training data are sent with `session.eval`. However should
be used for debugging purpose only. The users are encouraged to use
the read data OPs.
## References: ## References:
......
经典的线性回归任务
==================
PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
任务简介
--------
我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ,其中 `y = wx + b + ε`, 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
一个例子是房产估值。我们假设房产的价格(y)是其大小(x)的一个线性函数,那么我们可以通过收集市场上房子的大小和价格,用来估计线性函数的参数w 和 b。
准备数据
-----------
假设变量 `x` 和 `y` 的真实关系为: `y = 2x + 0.3 + ε`,这里展示如何使用观测数据来拟合这一线性关系。首先,Python代码将随机产生2000个观测点,作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
.. code-block:: python
# dataprovider.py
from paddle.trainer.PyDataProvider2 import *
import random
# 定义输入数据的类型: 2个浮点数
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2*x+0.3]
训练模型
-----------
为了还原 `y = 2x + 0.3`,我们先从一条随机的直线 `y' = wx + b` 开始,然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小,最终趋于接近。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。
在PaddlePaddle里,该模型的网络配置如下。
.. code-block:: python
# trainer_config.py
from paddle.trainer_config_helpers import *
# 1. 定义数据来源,调用上面的process函数获得观测数据
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
module='dataprovider', obj='process',args={})
# 2. 学习算法。控制如何改变模型参数 w 和 b
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. 神经网络配置
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
# 线性计算网络层: ȳ = wx + b
ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
# 计算误差函数,即 ȳ 和真实 y 之间的距离
cost = square_error_cost(input= ȳ, label=y)
outputs(cost)
这段简短的配置展示了PaddlePaddle的基本用法:
- 第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的 `process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件,所以放一个空列表(`empty.list`)即可。
- 第二部分主要是选择学习算法,它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法,这里使用一个基于momentum的随机梯度下降(SGD)算法,该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层,所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元:
- **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。
- **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。
- **回归误差代价层**:回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。
定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令:
.. code-block:: bash
paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加误差代价函数的输出在不断的减小,这意味着模型在训练数据上不断的改进,直到逼近真实解:` y = 2x + 0.3 `
模型检验
-----------
训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测,评价预测的效果。在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。
PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。
.. code-block:: python
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
# w=1.999743, b=0.300137
.. image:: ./parameters.png
:align: center
:scale: 80 %
从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型一致。
这样,我们用PaddlePaddle解决了单变量线性回归问题, 包括数据输入、模型训练和最后的结果验证。
Simple Linear Regression
========================
PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
Problem Background
------------------
Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
Prepare the Data
-----------------
Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
.. code-block:: python
# dataprovider.py
from paddle.trainer.PyDataProvider2 import *
import random
# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2*x+0.3]
Train a NeuralNetwork
----------------------
To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
.. code-block:: python
# trainer_config.py
from paddle.trainer_config_helpers import *
# 1. read data. Suppose you saved above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
module='dataprovider', obj='process',args={})
# 2. learning algorithm
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. Network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
cost = square_error_cost(input=y_predict, label=y)
outputs(cost)
Some of the most fundamental usages of PaddlePaddle are demonstrated:
- The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
- The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
- Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
- **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``.
- **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
- **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
Now that everything is ready, you can train the network with a simple command line call:
.. code-block:: bash
paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
Evaluate the Model
-------------------
Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass.
.. code-block:: python
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
# w=1.999743, b=0.300137
.. image:: parameters.png
:align: center
Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
从源码编译
======================
.. _build_step:
编译方法
----------------
PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
编译PaddlePaddle,需要执行:
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
# 如果不使用Docker编译环境,执行下面的命令
mkdir build
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
make
编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
.. code-block:: bash
pip install python/dist/*.whl
.. _run_test:
执行单元测试
----------------
如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法:
使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。
开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
如果不使用Docker,可以执行ctest命令即可:
.. code-block:: bash
mkdir build
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
make
ctest
# 指定执行其中一个单元测试 test_mul_op
ctest -R test_mul_op
.. _compile_deps:
编译依赖
----------------
PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。
.. csv-table:: PaddlePaddle编译依赖
:header: "依赖", "版本", "说明"
:widths: 10, 15, 30
"CMake", ">=3.5", ""
"GCC", "4.8.2", "推荐使用CentOS的devtools2"
"Python", "2.7.x", "依赖libpython2.7.so"
"pip", ">=9.0", ""
"numpy", "", ""
"SWIG", ">=2.0", ""
"Go", ">=1.8", "可选"
.. _build_options:
编译选项
----------------
PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。
用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考
`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如:
.. code-block:: bash
cmake .. -DWITH_GPU=OFF
.. csv-table:: 编译选项说明
:header: "选项", "说明", "默认值"
:widths: 1, 7, 2
"WITH_GPU", "是否支持GPU", "ON"
"WITH_C_API", "是否仅编译CAPI", "OFF"
"WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
"WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON"
"WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
"WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
"WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
"WITH_TESTING", "是否开启单元测试", "ON"
"WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
"WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON"
BLAS
+++++
PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
`OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集,
还会下载MKL-DNN数学库,详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
如果关闭MKL,则会使用OpenBLAS作为BLAS库。
CUDA/cuDNN
+++++++++++
PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。
PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。
我们推荐使用最新版本的cuDNN。
编译选项的设置
++++++++++++++
PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如
.. code-block:: bash
cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。**
Installing from Sources
==========================
* [1. Download and Setup](#download)
* [2. Requirements](#requirements)
* [3. Build on Ubuntu](#ubuntu)
* [4. Build on Centos](#centos)
## <span id="download">Download and Setup</span>
You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
```bash
git clone https://github.com/PaddlePaddle/Paddle paddle
cd paddle
```
## <span id="requirements">Requirements</span>
To compile the source code, your computer must be equipped with the following dependencies.
- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
- **BLAS**: MKL, OpenBlas or ATLAS
- **Python**: only support Python 2.7
- **Go**
**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
For CUDA 8.0, GCC versions later than 5.3 are not supported!
### Options
PaddlePaddle supports some build options.
<html>
<table>
<thead>
<tr>
<th scope="col" class="left">Optional</th>
<th scope="col" class="left">Description</th>
</tr>
</thead>
<tbody>
<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
</tbody>
</table>
</html>
**Note:**
- The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5.
- Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported.
- **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
As a simple example, consider the following:
1. **BLAS Dependencies(optional)**
CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically.
To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
```bash
# specify MKL
cmake .. -DMKL_ROOT=<mkl_path>
# or specify OpenBLAS
cmake .. -DOPENBLAS_ROOT=<openblas_path>
```
2. **Doc Dependencies(optional)**
To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows:
```bash
pip install 'sphinx>=1.4.0'
pip install sphinx_rtd_theme recommonmark
# install doxygen on Ubuntu
sudo apt-get install doxygen
# install doxygen on Mac OS X
brew install doxygen
# active docs in cmake
cmake .. -DWITH_DOC=ON`
```
## <span id="ubuntu">Build on Ubuntu 14.04</span>
### Install Dependencies
- **Paddle Dependencies**
```bash
# necessary
sudo apt-get update
sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
sudo apt-get install -y python python-pip python-numpy libpython-dev bison
sudo pip install 'protobuf==3.1.0.post1'
# Install Go
# You can follow https://golang.org/doc/install for a detailed explanation.
wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C $HOME -xzf go.tgz && \
mkdir $HOME/gopath && \
rm go.tgz
# Setup environment variables
export GOROOT=$HOME/go
export GOPATH=$HOME/gopath
export PATH=$PATH:$GOROOT/bin
# install cmake 3.4
curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
cd .. && rm -rf cmake-3.4.1
```
- **GPU Dependencies (optional)**
To build GPU version, you will need the following installed:
1. a CUDA-capable GPU
2. A supported version of Linux with a GCC compiler and toolchain
3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
The CUDA development environment relies on tight integration with the host development environment,
including the host compiler and C runtime libraries, and is therefore only supported on
distribution versions that have been qualified for this CUDA Toolkit release.
After downloading cuDNN library, issue the following commands:
```bash
sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
```
Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
```bash
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda/bin:$PATH
```
### Build and Install
As usual, the best option is to create build folder under paddle project directory.
```bash
mkdir build && cd build
```
Finally, you can build and install PaddlePaddle:
```bash
# you can add build option here, such as:
cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<path to install>/bin:$PATH
# install PaddlePaddle Python modules.
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
```
## <span id="centos">Build on Centos 7</span>
### Install Dependencies
- **CPU Dependencies**
```bash
# necessary
sudo yum update
sudo yum install -y epel-release
sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
sudo pip install wheel numpy
sudo pip install 'protobuf>=3.0.0'
```
- **GPU Dependencies (optional)**
To build GPU version, you will need the following installed:
1. a CUDA-capable GPU
2. A supported version of Linux with a GCC compiler and toolchain
3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
The CUDA development environment relies on tight integration with the host development environment,
including the host compiler and C runtime libraries, and is therefore only supported on
distribution versions that have been qualified for this CUDA Toolkit release.
After downloading cuDNN library, issue the following commands:
```bash
sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
```
Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
```bash
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda/bin:$PATH
```
### Build and Install
As usual, the best option is to create build folder under paddle project directory.
```bash
mkdir build && cd build
```
Finally, you can build and install PaddlePaddle:
```bash
# you can add build option here, such as:
cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<path to install>/bin:$PATH
# install PaddlePaddle Python modules.
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
```
Build from Sources
==========================
.. _build_step:
How To Build
----------------
PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
tools. We recommend you to use our pre-built Docker image to run the build
to avoid installing dependencies by yourself. We have several build environment
Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
If you choose not to use Docker image for your build, you need to install the
below `Compile Dependencies`_ before run the build.
Then run:
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# run the following command to build a CPU-Only binaries if you are using docker
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
# else run these commands
mkdir build
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
make
When the compile finishes, you can get the output whl package under
build/python/dist, then you can choose to install the whl on local
machine or copy it to the target machine.
.. code-block:: bash
pip install python/dist/*.whl
.. _run_test:
Run Tests
----------------
If you wish to run the tests, you may follow the below steps:
When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build.
Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
If you don't use Docker, just run ctest will start the tests:
.. code-block:: bash
mkdir build
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
make
ctest
# run a single test like test_mul_op
ctest -R test_mul_op
.. _compile_deps:
Compile Dependencies
----------------
PaddlePaddle need the following dependencies when compiling, other dependencies
will be downloaded automatically.
.. csv-table:: PaddlePaddle Compile Dependencies
:header: "Dependency", "Version", "Description"
:widths: 10, 15, 30
"CMake", ">=3.5", ""
"GCC", "4.8.2", "Recommend devtools2 for CentOS"
"Python", "2.7.x", "Need libpython2.7.so"
"pip", ">=9.0", ""
"numpy", "", ""
"SWIG", ">=2.0", ""
"Go", ">=1.8", "Optional"
.. _build_options:
Build Options
----------------
Build options include whether build binaries for CPU or GPU, which BLAS
library to use etc. You may pass these settings when running cmake.
For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
.. _build_options_bool:
Bool Type Options
----------------
You can add :code:`-D` argument to pass such options, like:
.. code-block:: bash
cmake .. -DWITH_GPU=OFF
.. csv-table:: Bool Type Options
:header: "Option", "Description", "Default"
:widths: 1, 7, 2
"WITH_GPU", "Build with GPU support", "ON"
"WITH_C_API", "Build only CAPI", "OFF"
"WITH_DOUBLE", "Build with double precision", "OFF"
"WITH_DSO", "Dynamically load CUDA libraries", "ON"
"WITH_AVX", "Build with AVX support", "ON"
"WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON"
"WITH_TESTING", "Build unit tests", "ON"
"WITH_DOC", "Build documentaions", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
"WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
BLAS
+++++
PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
`OpenBlAS <http://www.openblas.net/>`_ as BLAS library。By default it uses MKL.
If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
and used, for more `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
If you choose not to use MKL, then OpenBlAS will be used.
CUDA/cuDNN
+++++++++++
PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture
automatically in order to speed up the build.
PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to
keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN
you built.
Pass Compile Options
++++++++++++++
You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
When running cmake command, it will search system paths like
:code:`/usr/lib:/usr/local/lib` and then search paths that you
passed to cmake, i.e.
.. code-block:: bash
cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
PaddlePaddle的编译选项
======================
PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
Bool型的编译选项
----------------
用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如
.. code-block:: bash
cmake .. -DWITH_GPU=OFF
.. csv-table:: Bool型的编译选项
:widths: 1, 7, 2
:file: compile_options.csv
BLAS/CUDA/Cudnn的编译选项
--------------------------
BLAS
+++++
PaddlePaddle支持以下任意一种BLAS库:`MKL <https://software.intel.com/en-us/intel-mkl>`_ ,`ATLAS <http://math-atlas.sourceforge.net/>`_ ,`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
.. csv-table:: BLAS路径相关的编译选项
:widths: 1, 2, 7
:file: cblas_settings.csv
CUDA/Cudnn
+++++++++++
PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
编译选项的设置
++++++++++++++
PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如
.. code-block:: bash
cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。
编译选项,描述,注意
MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。
ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。
OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。
REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。
\ No newline at end of file
选项,说明,默认值
WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
WITH_DOUBLE,是否使用双精度浮点数。,否
WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是
WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
WITH_RDMA,是否开启RDMA,否
WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否
WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
WITH_DOC,是否编译中英文文档,否
WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
PaddlePaddle的Docker容器使用方式 使用Docker安装运行
================================ ================================
PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。 使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
Docker使用入门 如果您在使用Windows,可以参考
------------------------------ `这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
教程,完成在Windows上安装和使用Docker。
几个基础的概念帮助理解和使用Docker:
- *镜像*:一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行:
.. code-block:: bash
docker images
来列出当前系统中的所有镜像,同样可以执行:
.. code-block:: bash
docker pull paddlepaddle/paddle:0.10.0
来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用docker.paddlepaddle.org/paddle下载。
- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。
实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。
可以执行:
.. code-block:: bash
docker run paddlepaddle/paddle:0.10.0
来使用一个镜像启动一个容器。 在了解Docker的基本使用方法之后,即可开始下面的步骤:
- 默认情况下,Docker容器会运行在独立的文件系统空间之上,我们无法在Docker容器中 .. _docker_pull:
访问到主机上的文件。可以通过*挂载Volume*的方式,将主机上的文件或目录挂载到
Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下,容器使用
debian镜像,并且启动后执行 :code:`ls /data`。
.. code-block:: bash 获取PaddlePaddle的Docker镜像
docker run --rm -v $(pwd):/data debian ls /data
PaddlePaddle发布的Docker镜像使用说明
------------------------------ ------------------------------
我们把PaddlePaddle的编译环境打包成一个镜像,称为开发镜像,里面涵盖了 执行下面的命令获取最新的PaddlePaddle Docker镜像
PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜
像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次
PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运
行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
和国内镜像`docker.paddlepaddle.org` 提供最新
的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。
**注意:为了方便在国内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您在国内,请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。**
1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev`
这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布,
文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。
开发镜像包含了以下工具:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作,
也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发:
以交互容器方式运行开发镜像:
.. code-block:: bash .. code-block:: bash
docker run -it --rm -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /bin/bash docker pull paddlepaddle/paddle
或者,可以以后台进程方式运行容器 对于国内用户,我们提供了加速访问的镜像源
.. code-block:: bash .. code-block:: bash
docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D docker pull docker.paddlepaddle.org/paddle
然后用密码 :code:`root` SSH进入容器 下载GPU版本的Docker镜像
.. code-block:: bash .. code-block:: bash
ssh -p 2202 root@localhost docker pull paddlepaddle/paddle:latest-gpu
docker pull docker.paddlepaddle.org/paddle:latest-gpu
SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。
2. 生产镜像:根据CPU、GPU和非AVX区分了如下4个镜像:
- GPU/AVX::code:`paddlepaddle/paddle:<version>-gpu`
- GPU/no-AVX::code:`paddlepaddle/paddle:<version>-gpu-noavx`
- CPU/AVX::code:`paddlepaddle/paddle:<version>`
- CPU/no-AVX::code:`paddlepaddle/paddle:<version>-noavx`
纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX 选择下载使用不同的BLAS库的Docker镜像
.. code-block:: bash .. code-block:: bash
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi # 默认是使用MKL的镜像
docker pull paddlepaddle/paddle
如果输出是No,就需要选择使用no-AVX的镜像 # 使用OpenBLAS的镜像
docker pull paddlepaddle/paddle:latest-openblas
**注:在0.10.0之后的版本,PaddlePaddle都可以自动判断硬件是否支持AVX,所以无需判断AVX即可使用**
以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 下载指定版本的Docker镜像,可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag,并执行下面的命令:
为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
.. code-block:: bash .. code-block:: bash
nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash docker pull paddlepaddle/paddle:[tag]
# 比如:
docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。: .. _docker_run:
.. code-block:: bash 在Docker中执行PaddlePaddle训练程序
------------------------------
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu
3. 运行以及发布您的AI程序
假设您已经完成了一个AI训练的python程序 :code:`a.py`,这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行: 假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考
`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
编写),就可以使用下面的命令开始执行训练:
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/work paddle /work/a.py cd /home/work
docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
如果要使用GPU,请运行: 上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work`
指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work`
.. code-block:: bash 目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py`
为容器内执行的命令,即运行训练程序。
nvidia-docker run -it -v $PWD:/work paddle /work/a.py 当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码:
.. code-block:: bash
docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
cd /work
python train.py
这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0` **注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。**
创建和发布自己的AI程序镜像。
运行PaddlePaddle Book .. _docker_run_book:
---------------------
Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 使用Docker启动PaddlePaddle Book教程
------------------------------
使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。
大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。
我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: 我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行:
.. code-block:: bash .. code-block:: bash
docker run -p 8888:8888 paddlepaddle/book docker run -p 8888:8888 paddlepaddle/book
然后在浏览器中输入以下网址: 然后在浏览器中输入以下网址:
.. code-block:: text .. code-block:: text
http://localhost:8888/ http://localhost:8888/
就这么简单,享受您的旅程! 就这么简单,享受您的旅程!
通过Docker容器开发PaddlePaddle .. _docker_run_gpu:
------------------------------
开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。
1. 制作PaddlePaddle开发镜像
PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。
生成Docker镜像的方式有两个,一个是直接把一个容器转换成镜像,另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷,适合自己实验,可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚,其他人很容易看懂镜像生成过程,持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行:
.. code-block:: bash 使用Docker执行GPU训练
------------------------------
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
docker build -t paddle:dev .
docker build这个命令的-t指定了生成的镜像的名字,这里我们用paddle:dev。到此,PaddlePaddle开发镜像就被构建完毕了。
2. 制作PaddlePaddle生产镜像
生产镜像的生成分为两步,第一步是运行: 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用
`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
请不要忘记提前在物理机上安装GPU最新驱动。
.. code-block:: bash .. code-block:: bash
docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
以上命令会编译PaddlePaddle,生成运行程序,以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU,“WITH_AVX”控制生成的生产镜像是否支持AVX,”WITH_TEST“控制是否生成单元测试。
第二步是运行: **注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:**
.. code-block:: bash .. code-block:: bash
docker build -t paddle:prod -f build/Dockerfile ./build export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置,最终生成名为paddle:prod的生产镜像。 **关于AVX:**
3. 运行单元测试 AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独
`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
运行以下指令 以下指令能检查Linux电脑是否支持AVX
.. code-block:: bash .. code-block:: bash
docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
文档
----
Paddle的Docker开发镜像带有一个通过 `woboq code browser
<https://github.com/woboq/woboq_codebrowser>`_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。
只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码:
.. code-block:: bash
docker run -d --name paddle-cpu-doc paddle:0.10.0-dev
docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。 如果输出是No,就需要选择使用no-AVX的镜像
PaddlePaddle in Docker Containers Run in Docker Containers
================================= =================================
Docker container is currently the only officially-supported way to Run PaddlePaddle in Docker container so that you don't need to care about
running PaddlePaddle. This is reasonable as Docker now runs on all runtime dependencies, also you can run under Windows system. You can get
major operating systems including Linux, Mac OS X, and Windows. tutorials at `here <https://docs.docker.com/get-started/>`_ .
Please be aware that you will need to change `Dockers settings
<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
of your hardware resource on Mac OS X and Windows.
Working With Docker If you are using Windows, please refer to
------------------- `this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
tutorial to start running docker under windows.
Docker is simple as long as we understand a few basic concepts: After you've read above tutorials you may proceed the following steps.
- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type .. _docker_pull:
.. code-block:: bash Pull PaddlePaddle Docker Image
------------------------------
docker images
to list all images in the system. We can also run Run the following command to download the latest Docker images:
.. code-block:: bash .. code-block:: bash
docker pull paddlepaddle/paddle:0.10.0 docker pull paddlepaddle/paddle
to download a Docker image, paddlepaddle/paddle in this example,
from Dockerhub.com.
- *container*: considering a Docker image a program, a container is a For users in China, we provide a faster mirror:
"process" that runs the image. Indeed, a container is exactly an
operating system process, but with a virtualized filesystem, network
port space, and other virtualized environment. We can type
.. code-block:: bash .. code-block:: bash
docker run paddlepaddle/paddle:0.10.0 docker pull docker.paddlepaddle.org/paddle
to start a container to run a Docker image, paddlepaddle/paddle in this example. Download GPU version images:
- By default docker container have an isolated file system namespace,
we can not see the files in the host file system. By using *volume*,
mounted files in host will be visible inside docker container.
Following command will mount current dirctory into /data inside
docker container, run docker container from debian image with
command :code:`ls /data`.
.. code-block:: bash .. code-block:: bash
docker run --rm -v $(pwd):/data debian ls /data docker pull paddlepaddle/paddle:latest-gpu
docker pull docker.paddlepaddle.org/paddle:latest-gpu
Usage of CPU-only and GPU Images
----------------------------------
We package PaddlePaddle's compile environment into a Docker image,
called the develop image, it contains all compiling tools that
PaddlePaddle needs. We package compiled PaddlePaddle program into a
Docker image as well, called the production image, it contains all
runtime environment that running PaddlePaddle needs. For each version
of PaddlePaddle, we release both of them. Production image includes
CPU-only version and a CUDA GPU version and their no-AVX versions.
We put the docker images on `dockerhub.com
<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
latest versions under "tags" tab at dockerhub.com.
** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.**
1. development image :code:`paddlepaddle/paddle:<version>-dev` Choose between different BLAS version:
This image has packed related develop tools and runtime
environment. Users and developers can use this image instead of
their own local computer to accomplish development, build,
releasing, document writing etc. While different version of paddle
may depends on different version of libraries and tools, if you
want to setup a local environment, you must pay attention to the
versions. The development image contains:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers use servers with GPUs, they can use ssh to login to
the server and run :code:`docker exec` to enter the docker
container and start their work. Also they can start a development
docker image with SSHD service, so they can login to the container
and start work.
2. Production images, this image might have multiple variants:
- GPU/AVX::code:`paddlepaddle/paddle:<version>-gpu`
- GPU/no-AVX::code:`paddlepaddle/paddle:<version>-gpu-noavx`
- CPU/AVX::code:`paddlepaddle/paddle:<version>`
- CPU/no-AVX::code:`paddlepaddle/paddle:<version>-noavx`
Please be aware that the CPU-only and the GPU images both use the
AVX instruction set, but old computers produced before 2008 do not
support AVX. The following command checks if your Linux computer
supports AVX:
.. code-block:: bash .. code-block:: bash
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi # image using MKL by default
docker pull paddlepaddle/paddle
# image using OpenBLAS
docker pull paddlepaddle/paddle:latest-openblas
**NOTE:versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.**
To run the CPU-only image as an interactive container:
.. code-block:: bash If you want to use legacy versions, choose a tag from
`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash and run:
Above method work with the GPU image too -- the recommended way is
using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
Please install nvidia-docker first following this `tutorial
<https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
Now you can run a GPU image:
.. code-block:: bash .. code-block:: bash
nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash docker pull paddlepaddle/paddle:[tag]
# i.e.
docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
Train Model Using Python API
----------------------------
Our official docker image provides a runtime for PaddlePaddle
programs. The typical workflow will be as follows:
Create a directory as workspace:
.. code-block:: bash
mkdir ~/workspace
Edit a PaddlePaddle python program using your favourite editor
.. code-block:: bash
emacs ~/workspace/example.py
Run the program using docker:
.. code-block:: bash
docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py
Or if you are using GPU for training: .. _docker_run:
.. code-block:: bash Launch your training program in Docker
------------------------------
nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py Assume that you have already written a PaddlePaddle program
named :code:`train.py` under directory :code:`/home/work` (refer to
Above commands will start a docker container by running :code:`python `PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
/workspace/example.py`. It will stop once :code:`python for more samples), then run the following command:
/workspace/example.py` finishes.
Another way is to tell docker to start a :code:`/bin/bash` session and
run PaddlePaddle program interactively:
.. code-block:: bash
docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
# now we are inside docker container
cd /workspace
python example.py
Running with GPU is identical:
.. code-block:: bash
nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
# now we are inside docker container
cd /workspace
python example.py
Develop PaddlePaddle or Train Model Using C++ API
---------------------------------------------------
We will be using PaddlePaddle development image since it contains all
compiling tools and dependencies.
1. Build PaddlePaddle develop image
Use following command to build PaddlePaddle develop image:
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
docker build -t paddle:dev .
2. Build PaddlePaddle production image
There are two steps for building production image, the first step is to run:
.. code-block:: bash .. code-block:: bash
docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev cd /home/work
docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. In the above command, :code:`-it` means run the container interactively;
:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
to current absolute path in Linux) under :code:`/work` in the container.
:code:`paddlepaddle/paddle` to specify image to use; finnally
:code:`/work/train.py` is the command to run inside docker.
The second step is to run: Also, you can go into the container shell, run or debug your code
interactively:
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
cd /work
python train.py
docker build -t paddle:prod -f build/Dockerfile ./build **NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
The above command will generate the production image by copying the compiled PaddlePaddle program into the image. .. _docker_run_book:
3. Run unit test
Following command will run unit test:
.. code-block:: bash
docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
PaddlePaddle Book PaddlePaddle Book
------------------ ------------------
The Jupyter Notebook is an open-source web application that allows You can create a container serving PaddlePaddle Book using Jupyter Notebook in
you to create and share documents that contain live code, equations, one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook
visualizations and explanatory text in a single browser. for users and developers.If you want to
PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
We already exposed port 8888 for this book. If you want to
dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
We provide a packaged book image, simply issue the command: We provide a packaged book image, simply issue the command:
.. code-block:: bash .. code-block:: bash
docker run -p 8888:8888 paddlepaddle/book docker run -p 8888:8888 paddlepaddle/book
Then, you would back and paste the address into the local browser: Then, you would back and paste the address into the local browser:
.. code-block:: text .. code-block:: text
http://localhost:8888/ http://localhost:8888/
That's all. Enjoy your journey! That's all. Enjoy your journey!
.. _docker_run_gpu:
Train with Docker with GPU
------------------------------
Documentation We recommend using
------------- `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
to run GPU training jobs. Please ensure you have latest
GPU driver installed before move on.
Paddle Docker images include an HTML version of C++ source code .. code-block:: bash
generated using `woboq code browser
<https://github.com/woboq/woboq_codebrowser>`_. This makes it easy nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
for users to browse and understand the C++ source code.
**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
.. code-block:: bash
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
As long as we give the Paddle Docker container a name, we can run an **About AVX:**
additional Nginx Docker container to serve the volume from the Paddle
container:
.. code-block:: bash AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations.
The latest PaddlePaddle Docker image turns AVX on by default, so, if your
computer doesn't support AVX, you'll probably need to
`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
docker run -d --name paddle-cpu-doc paddle:<version> The following command will tell you whether your computer supports AVX.
docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
.. code-block:: bash
Then we can direct our Web browser to the HTML version of source code if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
at http://localhost:8088/paddle/
...@@ -6,11 +6,12 @@ ...@@ -6,11 +6,12 @@
安装流程 安装流程
++++++++ ++++++++
PaddlePaddle提供Docker镜像来部署环境。 PaddlePaddle提供pip和Docker的安装方式:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
pip_install_cn.rst
docker_install_cn.rst docker_install_cn.rst
...@@ -19,9 +20,14 @@ PaddlePaddle提供Docker镜像来部署环境。 ...@@ -19,9 +20,14 @@ PaddlePaddle提供Docker镜像来部署环境。
.. warning:: .. warning::
编译流程主要推荐高级用户查看,普通用户请走安装流程 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
cmake/build_from_source_cn.rst build_from_source_cn.rst
常见问题解答
++++++++++
`常见问题解答 <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
Install and Build Install and Build
================= =================
Install PaddlePaddle .. _install_steps:
----------------------
Install Steps
++++++++
You can choose either pip or Docker to complete your install:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
pip_install_en.rst
docker_install_en.rst docker_install_en.rst
Build from Source Build from Source
----------------- -----------------
.. warning:: .. warning::
Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary.
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
build_from_source_en.md build_from_source_en.md
FAQ
++++++++++
`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
使用pip安装
================================
PaddlePaddle可以使用常用的Python包管理工具
`pip <https://pip.pypa.io/en/stable/installing/>`_
完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。
.. _pip_install:
使用pip安装
------------------------------
执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本,需要执行:
.. code-block:: bash
pip install paddlepaddle-gpu
如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装,
您可以从下面的表格中找到需要的版本:
如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载:
.. image:: paddleci.png
:scale: 50 %
:align: center
.. csv-table:: 各个版本最新的whl包
:header: "版本说明", "cp27-cp27mu", "cp27-cp27mu", "C-API"
:widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
.. _pip_dependency:
运行环境依赖
------------------------------
PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。
PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。
.. csv-table:: PaddlePaddle环境依赖
:header: "依赖", "版本", "说明"
:widths: 10, 15, 30
"操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上"
"Python", "2.7.x", "暂时不支持Python3"
"libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
"libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
"libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
.. _pip_faq:
安装常见问题和解决方法
------------------------------
- paddlepaddle*.whl is not a supported wheel on this platform.
出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip:
.. code-block:: bash
pip install --upgrade pip
如果仍然存在问题,可以执行:
.. code-block:: bash
python -c "import pip; print(pip.pep425tags.get_supported())"
获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。
\ No newline at end of file
Install Using pip
================================
You can use current widely used Python package management
tool `pip <https://pip.pypa.io/en/stable/installing/>`_
to install PaddlePaddle. This method can be used in
most of current Linux systems or MacOS.
.. _pip_install:
Install Using pip
------------------------------
Run the following command to install PaddlePaddle on the current
machine, it will also download requirements.
.. code-block:: bash
pip install paddlepaddle
If you wish to install GPU version, just run:
.. code-block:: bash
pip install paddlepaddle-gpu
If you wish to install the latest develop branch PaddlePaddle,
you can download the latest whl package from our CI system. Access
the below links, log in as guest, then click at the "Artifact"
tab, you'll find the download link of whl packages.
If the links below shows up the login form, just click "Log in as guest" to start the download:
.. image:: paddleci.png
:scale: 50 %
:align: center
.. csv-table:: whl package of each version
:header: "version", "cp27-cp27mu", "cp27-cp27mu", "C-API"
:widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
.. _pip_dependency:
Runtime Dependency
------------------------------
PaddlePaddle installation packages (whl) does not only contain .py files,
but also binaries built from C++ code. We ensure that PaddlePaddle can
run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
and MacOS 10.12.
PaddlePaddle whl packages are trying to satisfy
`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
standard, which uses CentOS 5 as default build environment. But CUDA libraries
seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime,
so we use CentOS 6 as default build environment.
.. csv-table:: PaddlePaddle Runtime Deps
:header: "Dependency", "version", "description"
:widths: 10, 15, 30
"OS", "Linux, MacOS", "CentOS 6 or later,Ubuntu 14.04 or later,MacOS 10.12 or later"
"Python", "2.7.x", "Currently Python3 is not supported"
"libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols"
"libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols"
"libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols"
.. _pip_faq:
FAQ
------------------------------
- paddlepaddle*.whl is not a supported wheel on this platform.
The main cause of this issue is that your current platform is
not supported. Please check that you are using Python 2.7 series.
Besides, pypi only supports manylinux1 standard, you'll need to
upgrade your pip to >9.0.0. Then run the below command:
.. code-block:: bash
pip install --upgrade pip
If the problem still exists, run the following command:
.. code-block:: bash
python -c "import pip; print(pip.pep425tags.get_supported())"
Then you'll get supported package suffixes, then check if it matches
the file name of the whl package. You can find default whl package at
`here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_
If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
you'll need to update pip to the latest version; If your system supports
manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the
file to manylinux1_x86_64 suffix and then install.
新手入门 新手入门
============ ============
.. _quick_install:
快速安装
++++++++
PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
执行下面的命令完成快速安装:
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本,需要执行:
.. code-block:: bash
pip install paddlepaddle-gpu
更详细的安装和编译方法参考:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
build_and_install/index_cn.rst build_and_install/index_cn.rst
concepts/use_concepts_cn.rst
- `深度学习入门课程 <http://book.paddlepaddle.org/index.cn.html>`_ .. _quick_start:
快速开始
++++++++
创建一个 housing.py 并粘贴此Python代码:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。
.. toctree::
:maxdepth: 1
concepts/use_concepts_cn.rst
GET STARTED GET STARTED
============ ============
.. _quick_install:
Quick Install
----------------------
You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install:
.. code-block:: bash
pip install paddlepaddle
If you need to install GPU version, run:
.. code-block:: bash
pip install paddlepaddle-gpu
For more details about installation and build:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
build_and_install/index_en.rst build_and_install/index_en.rst
- `Deep Learning 101 <http://book.paddlepaddle.org/index.html>`_
.. _quick_start:
Quick Start
++++++++
Create a new file called housing.py, and paste this Python
code:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions
for the test housing data.
...@@ -19,7 +19,6 @@ ...@@ -19,7 +19,6 @@
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
dev/build_cn.rst
dev/write_docs_cn.rst dev/write_docs_cn.rst
模型配置 模型配置
......
...@@ -18,7 +18,6 @@ Development ...@@ -18,7 +18,6 @@ Development
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
dev/build_en.rst
dev/new_layer_en.rst dev/new_layer_en.rst
dev/contribute_to_paddle_en.md dev/contribute_to_paddle_en.md
......
...@@ -71,7 +71,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ...@@ -71,7 +71,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
``` ```
可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python``C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。
```text ```text
Called By: Called By:
...@@ -121,7 +121,7 @@ python -m yep -v main.py ...@@ -121,7 +121,7 @@ python -m yep -v main.py
1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo` 1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`
2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟如果单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
### 查看性能分析文件 ### 查看性能分析文件
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
limitations under the License. */ limitations under the License. */
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/memory/memcpy.h" #include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h" #include "paddle/memory/memory.h"
...@@ -27,11 +29,11 @@ ...@@ -27,11 +29,11 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::ostream& operator<<(std::ostream& os, const LoD& lod) { std::ostream &operator<<(std::ostream &os, const LoD &lod) {
os << "{"; os << "{";
for (auto& v : lod) { for (auto &v : lod) {
os << "{"; os << "{";
for (auto& i : v) { for (auto &i : v) {
os << i << ","; os << i << ",";
} }
os << "}"; os << "}";
...@@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) { ...@@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
return os; return os;
} }
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
LoD new_lod; LoD new_lod;
new_lod.reserve(level_end - level_begin); new_lod.reserve(level_end - level_begin);
for (size_t i = level_begin; i < level_end; i++) { for (size_t i = level_begin; i < level_end; i++) {
...@@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { ...@@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
return new_lod; return new_lod;
} }
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
size_t elem_end) { size_t elem_end) {
PADDLE_ENFORCE_LT(level, in.size()); PADDLE_ENFORCE_LT(level, in.size());
PADDLE_ENFORCE_LT(elem_end, in[level].size()); PADDLE_ENFORCE_LT(elem_end, in[level].size());
...@@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, ...@@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
res[0].assign(in[level].begin() + elem_begin, res[0].assign(in[level].begin() + elem_begin,
in[level].begin() + elem_end + 1); in[level].begin() + elem_end + 1);
for (size_t lvl = 1; lvl < res.size(); lvl++) { for (size_t lvl = 1; lvl < res.size(); lvl++) {
const auto& in_level = in[level + lvl]; const auto &in_level = in[level + lvl];
const auto& above_level = res[lvl - 1]; const auto &above_level = res[lvl - 1];
auto& out_level = res[lvl]; auto &out_level = res[lvl];
out_level.assign(in_level.begin() + above_level.front(), out_level.assign(in_level.begin() + above_level.front(),
in_level.begin() + above_level.back() + 1); in_level.begin() + above_level.back() + 1);
} }
...@@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, ...@@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
// to make the first offset equals 0, all the elements minus the first // to make the first offset equals 0, all the elements minus the first
// element // element
size_t front = res[lvl].front(); size_t front = res[lvl].front();
for (auto& ele : res[lvl]) { for (auto &ele : res[lvl]) {
ele -= front; ele -= front;
} }
} }
return res; return res;
} }
LoD ToAbsOffset(const LoD& in) { LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets // the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in; if (in.empty() || in.size() == 1) return in;
LoD result = in; LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) { for (int level = result.size() - 2; level >= 0; level--) {
for (auto& ele : result[level]) { for (auto &ele : result[level]) {
ele = result[level + 1][ele]; ele = result[level + 1][ele];
} }
} }
return result; return result;
} }
bool operator==(const LoD& a, const LoD& b) { bool operator==(const LoD &a, const LoD &b) {
if (a.size() != b.size()) { if (a.size() != b.size()) {
return false; return false;
} }
for (size_t i = 0; i < a.size(); i++) { for (size_t i = 0; i < a.size(); i++) {
const auto& a_level = a[i]; const auto &a_level = a[i];
const auto& b_level = b[i]; const auto &b_level = b[i];
if (a_level.size() != b_level.size()) { if (a_level.size() != b_level.size()) {
return false; return false;
} }
...@@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, ...@@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
} }
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>; using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx, LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
size_t end_idx, size_t start_level) { size_t end_idx, size_t start_level) {
LoD sub_lod; LoD sub_lod;
...@@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx, ...@@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
return LoDAndOffset{sub_lod, {start_idx, end_idx}}; return LoDAndOffset{sub_lod, {start_idx, end_idx}};
} }
void AppendLoD(LoD* lod, const LoD& lod_length) { void AppendLoD(LoD *lod, const LoD &lod_length) {
PADDLE_ENFORCE( PADDLE_ENFORCE(
lod->empty() || lod->size() == lod_length.size(), lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod."); "The lod_length should has the same size with the appended lod.");
...@@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) { ...@@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) {
*lod = LoD(lod_length.size(), std::vector<size_t>({0})); *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
} }
for (size_t i = 0; i < lod->size(); ++i) { for (size_t i = 0; i < lod->size(); ++i) {
auto& level = (*lod)[i]; auto &level = (*lod)[i];
for (size_t len : lod_length[i]) { for (size_t len : lod_length[i]) {
level.push_back(level.back() + len); level.push_back(level.back() + len);
} }
} }
} }
void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
const platform::DeviceContext &dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char *>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
framework::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto *pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto *data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto &gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::GPUPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char *>(data_ptr),
static_cast<std::streamsize>(size));
}
}
{ // the 4th field, lod information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
auto lod = tensor.lod();
uint64_t size = lod.size();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
for (auto &each : lod) {
size = each.size() * sizeof(framework::LoD::value_type::value_type);
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
os.write(reinterpret_cast<const char *>(each.data()),
static_cast<std::streamsize>(size));
}
}
}
void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
framework::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void *buf;
platform::Place cpu = platform::CPUPlace();
switch (desc.data_type()) {
case framework::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case framework::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case framework::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case framework::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char *>(buf), tensor->memory_size());
}
{ // read lod
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
is.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
lod[i] = tmp;
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -189,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset( ...@@ -189,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
void AppendLoD(LoD* lod, const LoD& lod_length); void AppendLoD(LoD* lod, const LoD& lod_length);
/*
* Serialize/Desiralize LoDTensor to std::ostream
* You can pass ofstream or ostringstream to serilize to file
* or to a in memory string. GPU tensor will be copied to CPU.
*/
void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
const platform::DeviceContext& dev_ctx);
void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
"The %d-th output of Output(%s) must be LoDTensor.", j, "The %d-th output of Output(%s) must be LoDTensor.", j,
out); out);
in_var->SetLoDLevel(out_var->GetLodLevel()); out_var->SetLoDLevel(in_var->GetLodLevel());
} }
bool IsRuntime() const override; bool IsRuntime() const override;
......
...@@ -135,18 +135,17 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, ...@@ -135,18 +135,17 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
auto dst_ptr = static_cast<void*>(dst->data()); auto dst_ptr = static_cast<void*>(dst->data());
if (platform::is_cpu_place(src.place())) { if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()), memory::Copy(dst_place, dst_ptr,
src_ptr, size); boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()), src_ptr, dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
size, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
} }
} // namespace framework } // namespace framework
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "FactorizationMachineLayer.h"
#include <algorithm>
#include <vector>
#include "paddle/math/SparseMatrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
bool FactorizationMachineLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
factorSize_ = config_.factor_size();
/* initialize the latentVectors_ */
CHECK_EQ(inputLayers_.size(), 1UL);
size_t inputSize = inputLayers_[0]->getSize();
CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
latentVectors_ = std::unique_ptr<Weight>(
new Weight(inputSize, factorSize_, parameters_[0]));
return true;
}
void FactorizationMachineLayer::forward(PassType passType) {
Layer::forward(passType);
const MatrixPtr& inputV = getInputValue(0);
size_t batchSize = inputV->getHeight();
size_t outputSize = getSize();
size_t inputSize = inputLayers_[0]->getSize();
reserveOutput(batchSize, outputSize);
MatrixPtr outV = getOutputValue();
Matrix::resizeOrCreate(
latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(
inputMulFactor_, batchSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
inputMulFactor_->mul(*inputV, *latentVectors_->getW());
inputMulFactor_->square2(*tmpOut_);
outV->sumRows(*tmpOut_, 0.5, 0);
if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
Matrix::resizeOrCreateSparseMatrix(inputSquare_,
inputV->getHeight(),
inputV->getWidth(),
inputV->getElementCnt(),
inputV->getValueType());
inputSquare_->copyFrom(*inputV);
(dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
} else {
Matrix::resizeOrCreate(
inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
inputV->square2(*inputSquare_);
}
latentVectors_->getW()->square2(*latentVectorsSquare_);
tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
outV->sumRows(*tmpOut_, -0.5, 1.0);
/* activation */ {
REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
forwardActivation();
}
}
void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
/* Do derivation */ { backwardActivation(); }
const MatrixPtr& inputV = getInputValue(0);
const MatrixPtr& oGrad = getOutputGrad();
Matrix::resizeOrCreate(
tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
latentVectors_->getW()->getHeight(),
1,
false,
useGpu_);
/* Calculate the gradients of the latentVectors_ matrix */
if (latentVectors_->getWGrad()) {
if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
Matrix::resizeOrCreateSparseMatrix(tmpInput_,
inputV->getHeight(),
inputV->getWidth(),
inputV->getElementCnt());
CpuSparseMatrix* sparseInputV =
dynamic_cast<CpuSparseMatrix*>(inputV.get());
CpuSparseMatrix* sparseInputSquare =
dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
CpuSparseMatrix* sparseTmpInput =
dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
sparseTmpInput->copyFrom(*sparseInputV);
sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
latentVectors_->getWGrad()->mul(
*sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
negOnes_->zeroMem();
negOnes_->add(-1);
tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
} else {
Matrix::resizeOrCreate(
tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
tmpInput_->rowScale(0, *inputV, *oGrad);
latentVectors_->getWGrad()->mul(
*tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
tmpInput_->rowScale(0, *inputSquare_, *oGrad);
tmpSum_->sumCols(*tmpInput_, -1, 0);
}
latentVectors_->getWGrad()->addRowScale(
0, *latentVectors_->getW(), *tmpSumTrans);
/* Increasing the number of gradient */
latentVectors_->getParameterPtr()->incUpdate(callback);
}
/* Calculate the input layers gradient */
MatrixPtr inGrad = getInputGrad(0);
if (inGrad != NULL) {
inGrad->mul(
*inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
inGrad->addColScale(0, *inputV, *tmpSum_);
inGrad->rowScale(0, *inGrad, *oGrad);
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/ThreadLocal.h"
namespace paddle {
/**
* @brief The Factorization Machine models pairwise (order-2) feature
* interactions as inner product of the learned latent vectors corresponding
* to each input feature.
*
* The Factorization Machine can effectively capture feature interactions
* especially when the input is sparse. While in principle FM can model higher
* order feature interaction, in practice usually only order-2 feature
* interactions are considered. The Factorization Machine Layer here only
* computes the order-2 interations with the formula:
*
* \f[
* y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
* \f]
*
* The detailed calculation for forward and backward can be found at this paper:
*
* Factorization machines.
*
* The config file api is factorization_machine.
*/
class FactorizationMachineLayer : public Layer {
protected:
// The latent vectors, shape: (size, factorSize_)
// Each row of the latentVectors_ matrix is the latent vector
// corresponding to one input feature dimension
std::unique_ptr<Weight> latentVectors_;
// The hyperparameter that defines the dimensionality of the factorization
size_t factorSize_;
private:
// Store the square values of the letent vectors matrix
MatrixPtr latentVectorsSquare_;
// Store the square values of input matrix
MatrixPtr inputSquare_;
// The result of input matrix * latent vector matrix that will be used in
// both forward and backward step
MatrixPtr inputMulFactor_;
// Store temporary calculation result
MatrixPtr tmpOut_;
MatrixPtr tmpSum_;
MatrixPtr tmpInput_;
// Negative identity matrix
MatrixPtr negOnes_;
public:
explicit FactorizationMachineLayer(const LayerConfig& config)
: Layer(config) {}
~FactorizationMachineLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
} // namespace paddle
...@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { ...@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
batchSize, batchSize,
codeLength_, codeLength_,
/* trans */ false, /* trans */ false,
useGpu(deviceId_)); false);
Matrix::resizeOrCreate(preOutput_.grad, Matrix::resizeOrCreate(preOutput_.grad,
batchSize, batchSize,
codeLength_, codeLength_,
/* trans */ false, /* trans */ false,
useGpu(deviceId_)); false);
IVectorPtr label = getInput(*getLabelLayer()).ids; IVectorPtr label = getInput(*getLabelLayer()).ids;
preOutput_.value->zeroMem(); preOutput_.value->zeroMem();
if (useGpu_) {
Matrix::resizeOrCreate(cpuOutput_,
output_.value->getHeight(),
output_.value->getWidth(),
/* trans */ false,
false);
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
cpuLabel_->copyFrom(*label);
cpuOutput_->copyFrom(*output_.value);
} else {
cpuOutput_ = output_.value;
cpuLabel_ = label;
}
/* add the bias-vector */ /* add the bias-vector */
if (biases_.get() != NULL) { if (biases_.get() != NULL) {
preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW()); if (useGpu_) {
Matrix::resizeOrCreate(cpuBias_,
1,
numClasses_ - 1,
/* trans */ false,
false);
cpuBias_->copyFrom(*biases_->getW());
} else {
cpuBias_ = biases_->getW();
}
preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
} }
for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
MatrixPtr input = getInputValue(i); MatrixPtr input = getInputValue(i);
if (useGpu_) {
Matrix::resizeOrCreate(cpuInput_,
input->getHeight(),
input->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeight_,
weights_[i]->getW()->getHeight(),
weights_[i]->getW()->getWidth(),
/* trans */ false,
false);
cpuInput_->copyFrom(*input);
cpuWeight_->copyFrom(*weights_[i]->getW());
} else {
cpuInput_ = input;
cpuWeight_ = weights_[i]->getW();
}
preOutput_.value->mulByBitCode( preOutput_.value->mulByBitCode(
numClasses_, *label, *weights_[i]->getW(), *input); numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
} }
// keep consistent with the clipping in the following softrelu // keep consistent with the clipping in the following softrelu
preOutput_.value->clip(-40.0, 40.0); preOutput_.value->clip(-40.0, 40.0);
preOutput_.value->sumByBitCode(numClasses_, preOutput_.value->sumByBitCode(numClasses_,
*label, *cpuLabel_,
*output_.value, *cpuOutput_,
-1); // scaleSum -1); // scaleSum
preOutput_.value->softrelu(*preOutput_.value); preOutput_.value->softrelu(*preOutput_.value);
MatrixPtr sum = MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
preOutput_.value->rowSum(*sum); preOutput_.value->rowSum(*sum);
output_.value->add(*sum); cpuOutput_->add(*sum);
if (useGpu_) {
output_.value->copyFrom(*cpuOutput_);
} else {
output_.value = cpuOutput_;
}
} }
void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
IVectorPtr label = getInput(*getLabelLayer()).ids; IVectorPtr label = getInput(*getLabelLayer()).ids;
if (useGpu_) {
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
cpuLabel_->copyFrom(*label);
} else {
cpuLabel_ = label;
}
preOutput_.grad->one(); preOutput_.grad->one();
preOutput_.grad->softreluDerivative(*preOutput_.value); preOutput_.grad->softreluDerivative(*preOutput_.value);
preOutput_.grad->subByBitCode(numClasses_, *label); preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
if (biases_ && biases_->getWGrad()) { if (biases_ && biases_->getWGrad()) {
preOutput_.grad->addByBitCodeBackward( MatrixPtr biases_grad = biases_->getWGrad();
numClasses_, *label, *biases_->getWGrad()); if (useGpu_) {
Matrix::resizeOrCreate(cpuBias_,
1,
numClasses_ - 1,
/* trans */ false,
false);
cpuBias_->copyFrom(*biases_grad);
} else {
cpuBias_ = biases_grad;
}
preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
if (useGpu_) {
biases_grad->copyFrom(*cpuBias_);
} else {
biases_grad = cpuBias_;
}
/* Increasing the number of gradient */ /* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback); biases_->getParameterPtr()->incUpdate(callback);
} }
...@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { ...@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the W-gradient for the current layer */ /* Calculate the W-gradient for the current layer */
MatrixPtr input = getInputValue(i); MatrixPtr input = getInputValue(i);
if (weights_[i]->getWGrad()) { if (weights_[i]->getWGrad()) {
MatrixPtr weights_grad = weights_[i]->getWGrad();
if (useGpu_) {
Matrix::resizeOrCreate(cpuInput_,
input->getHeight(),
input->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeightGrad_,
weights_grad->getHeight(),
weights_grad->getWidth(),
/* trans */ false,
false);
cpuInput_->copyFrom(*input);
cpuWeightGrad_->copyFrom(*weights_grad);
} else {
cpuInput_ = input;
cpuWeightGrad_ = weights_grad;
}
preOutput_.grad->mulByBitCodeBackwardWeight( preOutput_.grad->mulByBitCodeBackwardWeight(
numClasses_, *label, *weights_[i]->getWGrad(), *input); numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
if (useGpu_) {
weights_grad->copyFrom(*cpuWeightGrad_);
} else {
weights_grad = cpuWeightGrad_;
}
/* Increasing the number of gradient */ /* Increasing the number of gradient */
weights_[i]->getParameterPtr()->incUpdate(callback); weights_[i]->getParameterPtr()->incUpdate(callback);
} }
...@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { ...@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the input layers error */ /* Calculate the input layers error */
MatrixPtr inputGrad = getInputGrad(i); MatrixPtr inputGrad = getInputGrad(i);
if (inputGrad) { if (inputGrad) {
if (useGpu_) {
Matrix::resizeOrCreate(cpuInputGrad_,
inputGrad->getHeight(),
inputGrad->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeight_,
weights_[i]->getW()->getHeight(),
weights_[i]->getW()->getWidth(),
/* trans */ false,
false);
cpuInputGrad_->copyFrom(*inputGrad);
cpuWeight_->copyFrom(*weights_[i]->getW());
} else {
cpuInputGrad_ = inputGrad;
cpuWeight_ = weights_[i]->getW();
}
preOutput_.grad->mulByBitCodeBackwardError( preOutput_.grad->mulByBitCodeBackwardError(
numClasses_, *label, *weights_[i]->getW(), *inputGrad); numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
if (useGpu_) {
inputGrad->copyFrom(*cpuInputGrad_);
} else {
inputGrad = cpuInputGrad_;
}
} }
} }
} }
......
...@@ -80,6 +80,15 @@ protected: ...@@ -80,6 +80,15 @@ protected:
int codeLength_; int codeLength_;
/// temporary result of output_ /// temporary result of output_
Argument preOutput_; Argument preOutput_;
/// The temporary variables in CPU memory.
MatrixPtr cpuWeight_;
MatrixPtr cpuWeightGrad_;
MatrixPtr cpuInput_;
MatrixPtr cpuInputGrad_;
MatrixPtr cpuBias_;
MatrixPtr cpuOutput_;
IVectorPtr cpuLabel_;
}; };
} // namespace paddle } // namespace paddle
...@@ -62,11 +62,11 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) ...@@ -62,11 +62,11 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
endif() endif()
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
################## test_Evaluator ####################### ################## test_Evaluator #######################
add_unittest(test_Evaluator add_unittest(test_Evaluator
test_Evaluator.cpp) test_Evaluator.cpp)
############### test_RecurrentGradientMachine ############### ############### test_RecurrentGradientMachine ###############
# TODO(yuyang18): There is some bug in test_RecurrentGradientMachine # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
# I will fix it. # I will fix it.
add_unittest_without_exec(test_RecurrentGradientMachine add_unittest_without_exec(test_RecurrentGradientMachine
...@@ -77,7 +77,7 @@ if(NOT MOBILE_INFERENCE) ...@@ -77,7 +77,7 @@ if(NOT MOBILE_INFERENCE)
${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
############### test_NetworkCompare ############### ############### test_NetworkCompare ###############
add_unittest_without_exec(test_NetworkCompare add_unittest_without_exec(test_NetworkCompare
test_NetworkCompare.cpp) test_NetworkCompare.cpp)
if(WITH_GPU) if(WITH_GPU)
...@@ -89,34 +89,33 @@ if(NOT MOBILE_INFERENCE) ...@@ -89,34 +89,33 @@ if(NOT MOBILE_INFERENCE)
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif() endif()
endif()
add_unittest_without_exec(test_PyDataProvider2
test_PyDataProvider2.cpp)
add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
)
################# test_CompareSparse ################## ################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp) test_CompareSparse.cpp)
if(NOT ON_TRAVIS) if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
./.set_port.sh -p port -n 6 ./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif() endif()
################ test_CompareTwoNets ###################### ################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp) test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################ test_PyDataProvider2 ######################
add_unittest_without_exec(test_PyDataProvider2
test_PyDataProvider2.cpp)
add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
)
...@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) { ...@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
config.layerConfig.add_inputs(); config.layerConfig.add_inputs();
config.layerConfig.add_inputs(); config.layerConfig.add_inputs();
// Not support GPU now for (auto useGpu : {false, true}) {
testLayerGrad(config, testLayerGrad(config,
"hsigmoid", "hsigmoid",
100, 100,
/* trans */ false, /* useGpu */ /* trans */ false,
false); /* useGpu */ useGpu);
}
} }
TEST(Layer, multi_cross) { TEST(Layer, multi_cross) {
...@@ -2464,6 +2465,25 @@ TEST(Layer, L2DistanceLayer) { ...@@ -2464,6 +2465,25 @@ TEST(Layer, L2DistanceLayer) {
} }
} }
void testFactorizationMachineLayer(InputType type, bool useGpu) {
const int FACTOR_SIZE = 10;
TestConfig config;
config.layerConfig.set_type("factorization_machine");
config.layerConfig.set_factor_size(FACTOR_SIZE);
config.layerConfig.set_size(1);
config.biasSize = 0;
config.inputDefs.push_back({type, "layer_0", 128, 1280});
config.layerConfig.add_inputs();
testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
}
TEST(Layer, FactorizationMachineLayer) {
for (auto useGpu : {false, true}) {
testFactorizationMachineLayer(INPUT_DATA, useGpu);
}
testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
initMain(argc, argv); initMain(argc, argv);
......
...@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { ...@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
os << ";"; os << ";";
} }
void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
CHECK(getFormat() != SPARSE_CSC) << "Not supported";
CHECK_EQ(height_, b.getHeight());
CHECK_EQ(width_, b.getWidth());
real* A = getValue();
real* B = b.getValue();
if (b.getValueType() == FLOAT_VALUE) {
for (size_t i = 0; i < height_; i++) {
size_t start = getRowStartIdx(i);
size_t end = getRowStartIdx(i + 1);
CHECK_EQ(start, b.getRowStartIdx(i));
CHECK_EQ(end, b.getRowStartIdx(i + 1));
for (size_t j = start; j < end; j++) {
A[j] = B[j] * c.getElement(i, cCol);
}
}
} else if (b.getValueType() == NO_VALUE) {
for (size_t i = 0; i < height_; i++) {
size_t start = getRowStartIdx(i);
size_t end = getRowStartIdx(i + 1);
CHECK_EQ(start, b.getRowStartIdx(i));
CHECK_EQ(end, b.getRowStartIdx(i + 1));
for (size_t j = start; j < end; j++) {
A[j] = c.getElement(i, cCol);
}
}
}
}
void CpuSparseMatrix::randomizeUniform() { void CpuSparseMatrix::randomizeUniform() {
CHECK_LE(elementCnt_, height_ * width_); CHECK_LE(elementCnt_, height_ * width_);
if (valueType_ == FLOAT_VALUE) { if (valueType_ == FLOAT_VALUE) {
......
...@@ -239,6 +239,15 @@ public: ...@@ -239,6 +239,15 @@ public:
const unsigned int* cols, const unsigned int* cols,
const real* values); const real* values);
/**
* @brief this_row = b_row * c_row[cCol]
*
* @param[in] cCol the column of matrix c used to scale each row of b
* @param[in] b CpuSparseMatrix
* @param[in] c Matrix
*/
void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
void randomizeUniform(); void randomizeUniform();
void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream); void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
......
...@@ -191,6 +191,7 @@ set(DEPS_OPS ...@@ -191,6 +191,7 @@ set(DEPS_OPS
sum_op sum_op
pool_op pool_op
maxout_op maxout_op
unpool_op
pool_with_index_op pool_with_index_op
conv_op conv_op
conv_transpose_op conv_transpose_op
...@@ -205,8 +206,24 @@ set(DEPS_OPS ...@@ -205,8 +206,24 @@ set(DEPS_OPS
tensor_array_read_write_op tensor_array_read_write_op
gru_op gru_op
adagrad_op adagrad_op
sgd_op) sgd_op
save_op
load_op
send_op
recv_op)
add_subdirectory(detail)
op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
send_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
recv_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
...@@ -219,6 +236,7 @@ op_library(adagrad_op DEPS selected_rows_functor) ...@@ -219,6 +236,7 @@ op_library(adagrad_op DEPS selected_rows_functor)
op_library(conv_op DEPS vol2col) op_library(conv_op DEPS vol2col)
op_library(pool_op DEPS pooling) op_library(pool_op DEPS pooling)
op_library(maxout_op DEPS maxouting) op_library(maxout_op DEPS maxouting)
op_library(unpool_op DEPS unpooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
...@@ -235,6 +253,10 @@ op_library(conv_transpose_op DEPS vol2col) ...@@ -235,6 +253,10 @@ op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
op_library(${src}) op_library(${src})
...@@ -242,6 +264,8 @@ endforeach() ...@@ -242,6 +264,8 @@ endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
...@@ -251,3 +275,4 @@ if(WITH_GPU) ...@@ -251,3 +275,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
...@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
const TensorFormat tensor_format = const TensorFormat tensor_format =
StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format")); StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions.");
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (tensor_format == TensorFormat::NCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]); : x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"Input X must have 3 to 5 dimensions.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
...@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> { ...@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
const auto *x = ctx.Input<Tensor>("X"); const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5"); "The Input dim size should be between 2 and 5");
const int N = x_dims[0]; const int N = x_dims[0];
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (tensor_format == TensorFormat::NCHW ? x_dims[1]
...@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T> ...@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
// Get the size for each dimension. // Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width] // NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5"); "The Input dim size should be between 2 and 5");
const int N = x_dims[0]; const int N = x_dims[0];
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (tensor_format == TensorFormat::NCHW ? x_dims[1]
......
...@@ -29,6 +29,12 @@ void ExtractNCWHD(const framework::DDim &dims, ...@@ -29,6 +29,12 @@ void ExtractNCWHD(const framework::DDim &dims,
const TensorFormat &tensor_format, int *N, int *C, int *H, const TensorFormat &tensor_format, int *N, int *C, int *H,
int *W, int *D) { int *W, int *D) {
*N = dims[0]; *N = dims[0];
if (dims.size() == 2) {
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
*C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
*H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
*W = dims.size() > 3 *W = dims.size() > 3
...@@ -37,6 +43,7 @@ void ExtractNCWHD(const framework::DDim &dims, ...@@ -37,6 +43,7 @@ void ExtractNCWHD(const framework::DDim &dims,
*D = dims.size() > 4 *D = dims.size() > 4
? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
: 1; : 1;
}
} }
template <typename T> template <typename T>
...@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> { ...@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
// NCHW [batch_size, in_channels, in_height, in_width] // NCHW [batch_size, in_channels, in_height, in_width]
const auto *x = ctx.Input<Tensor>("X"); const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5"); "The Input dim size should be between 2 and 5");
int N, C, H, W, D; int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
...@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T> ...@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T>
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5"); "The Input dim size should be between 2 and 5");
int N, C, H, W, D; int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
......
...@@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t cudnn_conv_desc = cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations); conv_desc.descriptor<T>(paddings, strides, dilations);
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it mannually // cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups // FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1. // rather than setting it to 1.
...@@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { ...@@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t cudnn_conv_desc = cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations); conv_desc.descriptor<T>(paddings, strides, dilations);
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it mannually // cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups // FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1. // rather than setting it to 1.
......
...@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, ...@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
.SetDefault({0, 0}); .SetDefault({0, 0});
AddAttr<int>( AddAttr<int>(
"groups", "groups",
"(int default:1), the group size of convolution operator. " "(int default:1), the groups number of the convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the " "when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters " "first half of the input channels, while the second half of the filters "
...@@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, ...@@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
Convolution Operator. Convolution Operator.
The convolution operation calculates the output based on the input, filter The convolution operation calculates the output based on the input, filter
and strides, paddings, groups, dilations parameters. The size of each dimension of the and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch Input(Input) and Output(Output) are in NCHW format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements. the width of the feature.
These two elements represent height and width, respectively. Filters(Input) is MCHW format. Where M is the number of output image channels, C is
the number of input image channels, H is the height of the filter, and W
is the width of the filter.
Parameters(strides, paddings, dilations) are two elements. These two elements represent
height and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
Input shape: (N, C_in, H_in, W_in) Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: (C_out, C_in, H_f, W_f) Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output: Output:
Output shape: (N, C_out, H_out, W_out) Output shape: $(N, C_{out}, H_{out}, W_{out})$
where Where
H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1; $$
W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1; H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
$$
)DOC"); )DOC");
} }
...@@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, ...@@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
.SetDefault({0, 0, 0}); .SetDefault({0, 0, 0});
AddAttr<int>( AddAttr<int>(
"groups", "groups",
"(int default:1), the group size of convolution operator. " "(int default:1), the groups number of the convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the " "when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters " "first half of the input channels, while the second half of the filters "
...@@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, ...@@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
AddAttr<std::vector<int>>("dilations", AddAttr<std::vector<int>>("dilations",
"(vector<int> default:{1, 1, 1}), the " "(vector<int> default:{1, 1, 1}), the "
"dilations(d_dilation, h_dilation, w_dilation) of " "dilations(d_dilation, h_dilation, w_dilation) of "
"convolution operator. Currently, conv3d doesn't " "convolution operator.")
"support dilation.")
.SetDefault({1, 1, 1}); .SetDefault({1, 1, 1});
AddComment(R"DOC( AddComment(R"DOC(
Convolution3D Operator. Convolution3D Operator.
The convolution operation calculates the output based on the input, filter The convolution operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch Input(Input) and output(Output) are in NCDHW format, where N is batch
size, C is the number of channels,D is the depth of the feature, H is the height of size, C is the number of channels,D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(ksize, strides, paddings) the feature, and W is the width of the feature.
are three elements. These three elements represent depth, height and width, respectively. Filters(Input) is MCDHW format, where M is the number of output image channels,
C is the number of input image channels, D is the depth of the filter,
H is the height of the filter, and W is the width of the filter.
Parameters(strides, paddings, dilations) are three elements. These three elements
represent depth, height and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
Input shape: (N, C_in, D_in, H_in, W_in) Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
Filter shape: (C_out, C_in, D_f, H_f, W_f) Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
Output: Output:
Output shape: (N, C_out, D_out, H_out, W_out) Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
where Where
D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1; $$
H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1; D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1; H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
$$
)DOC"); )DOC");
} }
......
...@@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
"ConvTransposeOp input dimension and strides dimension should " "ConvTransposeOp input dimension and strides dimension should "
"be consistent."); "be consistent.");
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
"ConvTransposeOp paddings dimension and Conv strides " "ConvTransposeOp paddings dimension and strides "
"dimension should be the same."); "dimension should be the same.");
PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
"In ConvTransposeOp, The input channel should be the same " "In ConvTransposeOp, The input channel should be the same "
...@@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( ...@@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
"The format of input tensor is NCHW. Where N is batch size, C is the " "The format of input tensor is NCHW. Where N is batch size, C is the "
"number of input channels, H is the height of the feature, and " "number of input channels, H is the height of the feature, and "
"W is the width of the feature."); "W is the width of the feature.");
AddInput("Filter", AddInput(
"Filter",
"(Tensor) The filter tensor of convolution transpose operator. " "(Tensor) The filter tensor of convolution transpose operator. "
"The format of the filter tensor is CMHW, where C is the number of " "The format of the filter tensor is MCHW, where M is the number of "
"output image channels, M is the number of input image channels, " "input feature channels, C is the number of "
"output feature channels,"
"H is the height of the filter, and W is the width of the filter. " "H is the height of the filter, and W is the width of the filter. "
"We enforce groups number == 1 and padding == 0 in " "We enforce groups number == 1 in the convolution transpose scenario.");
"the convolution transpose scenario.");
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator. " "(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is also NCHW."); "The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"strides", "strides",
"(vector<int> defalut:{1, 1}), the strides(h_stride, w_stride) of " "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
"convolution transpose operator.") "convolution transpose operator.")
.SetDefault({1, 1}); .SetDefault({1, 1});
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector<int> defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution " "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
"transpose operator.") "transpose operator.")
.SetDefault({0, 0}); .SetDefault({0, 0});
AddComment(R"DOC( AddComment(R"DOC(
...@@ -88,21 +89,26 @@ Convolution2D Transpose Operator. ...@@ -88,21 +89,26 @@ Convolution2D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch number of channels, H is the height of the feature, and W is the width of the feature.
size, C is the number of channels, H is the height of the feature, and Filter(Input) is in MCHW format. Where M is the number of input feature channels,
W is the width of the feature. Parameters(ksize, strides, paddings) are two elements. C is the number of output feature channels, H is the height of the filter,
These two elements represent height and width, respectively. and W is the width of the filter.
Parameters(strides, paddings) are two elements. These two elements represent height
and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
Input shape: (N, C_in, H_in, W_in) Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: (C_in, C_out, H_f, W_f) Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
Output: Output:
Output shape: (N, C_out, H_out, W_out) Output shape: $(N, C_{out}, H_{out}, W_{out})$
where Where
H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; $$
W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\
W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f
$$
)DOC"); )DOC");
} }
...@@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( ...@@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
"W is the width of the feature."); "W is the width of the feature.");
AddInput("Filter", AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator." "(Tensor) The filter tensor of convolution transpose operator."
"The format of the filter tensor is CMDHW, where C is the number of " "The format of the filter tensor is MCDHW, where M is the number of "
"output image channels, M is the number of input image channels, D " "input feature channels, C is the number of "
"output feature channels, D "
"is the depth of the filter, H is the height of the filter, and " "is the depth of the filter, H is the height of the filter, and "
"W is the width of the filter." "W is the width of the filter."
"We enforce groups number == 1 and padding == 0 in " "We enforce groups number == 1 and padding == 0 in "
...@@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( ...@@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
"the number of channels, D is the depth of the feature, H is the " "the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature."); "height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>("strides",
"(vector<int> defalut:{1, 1, 1}), the " "(vector<int> default:{1, 1, 1}), the "
"strides{d_stride, h_stride, w_stride} of " "strides{d_stride, h_stride, w_stride} of "
"convolution transpose operator.") "convolution transpose operator.")
.SetDefault({1, 1, 1}); .SetDefault({1, 1, 1});
AddAttr<std::vector<int>>("paddings", AddAttr<std::vector<int>>("paddings",
"(vector<int> defalut:{0, 0, 0}), paddings(d_pad, " "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
"h_pad, w_pad) of convolution transpose operator.") "h_pad, w_pad) of convolution transpose operator.")
.SetDefault({0, 0, 0}); .SetDefault({0, 0, 0});
AddComment(R"DOC( AddComment(R"DOC(
...@@ -144,23 +151,28 @@ Convolution3D Transpose Operator. ...@@ -144,23 +151,28 @@ Convolution3D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch number of channels, D is the depth of the feature, H is the height of the feature,
size, C is the number of channels, D is the depth of the feature, and W is the width of the feature.
H is the height of the feature, and W is the width of the feature. Filter(Input) is in MCDHW format. Where M is the number of input feature channels,
Parameters(ksize, strides, paddings) are three elements. C is the number of output feature channels, D is the depth of the filter,H is the
These three elements represent depth, height and width, respectively. height of the filter, and W is the width of the filter.
Parameters(strides, paddings) are three elements. These three elements represent
depth, height and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
Input shape: (N, C_in, D_in, H_in, W_in) Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
Filter shape: (C_in, C_out, D_f, H_f, W_f) Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
Output: Output:
Output shape: (N, C_out, D_out, H_out, W_out) Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
where Where
D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; $$
H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\
W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2]; H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\
W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f
$$
)DOC"); )DOC");
} }
......
...@@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
// TODO(Zhuoyuan): Paddings can be added in future.
// groups will alway be disabled in conv2dtranspose. // groups will alway be disabled in conv2dtranspose.
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
......
grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "send_recv_impl.h"
namespace paddle {
namespace operators {
namespace detail {
Status SendRecvServerImpl::SendVariable(ServerContext *context,
const VariableMessage *in_var,
VariableMessage *out_var) {
framework::LoDTensor t;
// TODO(typhoonzero): desirealize in_tensor and run pserver network.
std::istringstream iss(in_var->serialized());
framework::DeserializeFromStream(iss, &t);
lodtensor_queue_.Push(std::move(t));
// Block util the sub graph is done.
t = lodtensor_return_queue_.Pop();
std::ostringstream oss;
// FIXME(typhoonzero): get context from op.
framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
std::string *varname = out_var->mutable_varname();
*varname = in_var->varname();
std::string *serialized = out_var->mutable_serialized();
*serialized = oss.str();
return Status::OK;
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "send_recv_impl.h"
namespace paddle {
namespace operators {
namespace detail {
bool RPCClient::SendVariable(const framework::Scope& scope,
const std::string& inname,
const std::string& outname) {
ClientContext context;
VariableMessage msg, out_msg;
// FIXME(typhoonzero): pass device context to here.
auto ctx = platform::CPUDeviceContext();
auto* var = scope.FindVar(inname);
PADDLE_ENFORCE(var);
// TODO(typhoonzero): support SelectedRows
PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
"Only support LoDTensor, %s has wrong type", inname);
const framework::LoDTensor& tensor = var->Get<framework::LoDTensor>();
std::ostringstream oss;
framework::SerializeToStream(oss, tensor, ctx);
msg.set_varname(inname);
msg.set_serialized(oss.str());
Status status = stub_->SendVariable(&context, msg, &out_msg);
if (!status.ok()) {
return false;
}
std::istringstream iss(out_msg.serialized());
framework::LoDTensor ret_tensor;
framework::DeserializeFromStream(iss, &ret_tensor);
auto* outvar = scope.FindVar(outname);
framework::LoDTensor* out_tensor = outvar->GetMutable<framework::LoDTensor>();
// FIXME(typhoonzero): do not copy.
framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor);
return true;
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto3";
package sendrecv;
service SendRecvService {
// For parameter server round-robin like hashing, do not split tensors.
// Send and recv only one tensor
rpc SendVariable(VariableMessage) returns (VariableMessage) {}
}
// VariableMessage is serialized paddle variable message.
// It can be:
// Tensor
// LoDTensor
// SelectedRows
message VariableMessage {
string varname = 1;
bytes serialized = 2;
}
message VoidMessage {}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/data_type.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/operators/detail/simple_block_queue.h"
// #include <grpc++/channel.h>
// #include <grpc++/client_context.h>
// #include <grpc++/create_channel.h>
// #include <grpc++/security/credentials.h>
#include "paddle/operators/detail/send_recv.grpc.pb.h"
#include "paddle/operators/detail/send_recv.pb.h"
#include <grpc++/grpc++.h>
using grpc::Channel;
using grpc::Server;
using grpc::ServerContext;
using grpc::ServerReader;
using grpc::ServerBuilder;
using grpc::ClientContext;
using grpc::ClientReader;
using grpc::ClientReaderWriter;
using grpc::ClientWriter;
using grpc::Status;
using sendrecv::SendRecvService;
using sendrecv::VariableMessage;
using sendrecv::VoidMessage;
namespace paddle {
namespace operators {
namespace detail {
class SendRecvServerImpl final : public SendRecvService::Service {
public:
explicit SendRecvServerImpl() {}
Status SendVariable(ServerContext *context, const VariableMessage *in_var,
VariableMessage *out_var) override;
const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
void Push(const framework::LoDTensor &tensor) {
this->lodtensor_return_queue_.Push(tensor);
}
private:
SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
};
// RPCClient is a class to send tensors to pserver sub-network
// using different hashing methods.
class RPCClient {
public:
RPCClient(std::shared_ptr<Channel> channel)
: stub_(SendRecvService::NewStub(channel)) {}
bool SendVariable(const framework::Scope &scope, const std::string &inname,
const std::string &outname);
private:
std::unique_ptr<SendRecvService::Stub> stub_;
};
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <deque>
#include <mutex>
namespace paddle {
namespace operators {
namespace detail {
template <typename T>
class SimpleBlockQueue {
private:
std::mutex mutex_;
std::condition_variable condition_;
std::deque<T> queue_;
public:
void Push(T const& value) {
{
std::unique_lock<std::mutex> lock(this->mutex_);
queue_.push_front(value);
}
this->condition_.notify_one();
}
T Pop() {
std::unique_lock<std::mutex> lock(this->mutex_);
this->condition_.wait(lock, [=] { return !this->queue_.empty(); });
T rc(std::move(this->queue_.back()));
this->queue_.pop_back();
return rc;
}
};
} // namespace detail
} // namespace operators
} // namespace paddle
...@@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel<T> {
int frame_size = hidden_dims[1]; int frame_size = hidden_dims[1];
math::hl_gru_value<T> gru_value; math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data); gru_value.gate_weight = const_cast<T*>(weight_data);
gru_value.stateWeight = gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0; Tensor ordered_h0;
const size_t* order = batch_gate->lod()[2].data(); const size_t* order = batch_gate->lod()[2].data();
...@@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel<T> {
// to reorder. // to reorder.
ReorderInitState<Place, T>(context.device_context(), *h0, order, ReorderInitState<Place, T>(context.device_context(), *h0, order,
&ordered_h0, true); &ordered_h0, true);
gru_value.prevOutValue = ordered_h0.data<T>(); gru_value.prev_out_value = ordered_h0.data<T>();
} else { } else {
gru_value.prevOutValue = nullptr; gru_value.prev_out_value = nullptr;
} }
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
...@@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel<T> {
Tensor gate_t = batch_gate->Slice(bstart, bend); Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend); Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.outputValue = hidden_t.data<T>(); gru_value.output_value = hidden_t.data<T>();
gru_value.gateValue = gate_t.data<T>(); gru_value.gate_value = gate_t.data<T>();
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>(); gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
math::GRUUnitFunctor<Place, T>::compute( math::GRUUnitFunctor<Place, T>::compute(
dev_ctx, gru_value, frame_size, cur_batch_size, dev_ctx, gru_value, frame_size, cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")), math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation"))); math::ActiveType(context.Attr<std::string>("gate_activation")));
gru_value.prevOutValue = gru_value.outputValue; gru_value.prev_out_value = gru_value.output_value;
} }
math::Batch2LoDTensorFunctor<Place, T> to_seq; math::Batch2LoDTensorFunctor<Place, T> to_seq;
...@@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel<T> {
to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
math::hl_gru_value<T> gru_value; math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data); gru_value.gate_weight = const_cast<T*>(weight_data);
gru_value.stateWeight = gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
math::hl_gru_grad<T> gru_grad; math::hl_gru_grad<T> gru_grad;
if (weight_grad) { if (weight_grad) {
gru_grad.gateWeightGrad = gru_grad.gate_weight_grad =
weight_grad->mutable_data<T>(context.GetPlace()); weight_grad->mutable_data<T>(context.GetPlace());
zero(dev_ctx, weight_grad, static_cast<T>(0.0)); zero(dev_ctx, weight_grad, static_cast<T>(0.0));
gru_grad.stateWeightGrad = gru_grad.state_weight_grad =
weight_grad->data<T>() + 2 * frame_size * frame_size; weight_grad->data<T>() + 2 * frame_size * frame_size;
} else { } else {
gru_grad.gateWeightGrad = nullptr; gru_grad.gate_weight_grad = nullptr;
gru_grad.stateWeightGrad = nullptr; gru_grad.state_weight_grad = nullptr;
} }
auto batch_starts = batch_hidden_grad.lod()[0]; auto batch_starts = batch_hidden_grad.lod()[0];
...@@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel<T> {
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend); Tensor gate_t = batch_gate->Slice(bstart, bend);
gru_value.gateValue = gate_t.data<T>(); gru_value.gate_value = gate_t.data<T>();
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>(); gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
gru_grad.outputGrad = hidden_grad_t.data<T>(); gru_grad.output_grad = hidden_grad_t.data<T>();
Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
gru_grad.gateGrad = gate_grad_t.data<T>(); gru_grad.gate_grad = gate_grad_t.data<T>();
Tensor reset_hidden_prev_grad_t = Tensor reset_hidden_prev_grad_t =
batch_reset_hidden_prev_grad.Slice(bstart, bend); batch_reset_hidden_prev_grad.Slice(bstart, bend);
gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>(); gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data<T>();
if (n == 0) { if (n == 0) {
gru_value.prevOutValue = h0 ? ordered_h0.data<T>() : nullptr; gru_value.prev_out_value = h0 ? ordered_h0.data<T>() : nullptr;
gru_grad.prevOutGrad = gru_grad.prev_out_grad =
h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr; h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
} else { } else {
int bstart_pre = static_cast<int>(batch_starts[n - 1]); int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
gru_value.prevOutValue = hidden_prev_t.data<T>(); gru_value.prev_out_value = hidden_prev_t.data<T>();
Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>(); gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
} }
math::GRUUnitGradFunctor<Place, T>::compute( math::GRUUnitGradFunctor<Place, T>::compute(
......
...@@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase { ...@@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase {
out_var_name); out_var_name);
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); auto *tensor = out_var->GetMutable<framework::LoDTensor>();
framework::DeserializeFromStream(fin, tensor);
uint32_t version;
fin.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
framework::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
fin.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
fin.read(reinterpret_cast<char *>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(),
std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void *buf;
platform::Place cpu = platform::CPUPlace();
switch (desc.data_type()) {
case framework::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case framework::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case framework::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case framework::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
fin.read(static_cast<char *>(buf), tensor->memory_size());
}
{ // read lod
uint64_t lod_level;
fin.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
fin.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
fin.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
lod[i] = tmp;
}
}
auto place = dev_ctx.GetPlace(); auto place = dev_ctx.GetPlace();
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/log_loss_op.h"
namespace paddle {
namespace operators {
class LogLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Predicted"),
"Input(Predicted) must be initialized.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) must be initialized.");
auto pred_dims = ctx->GetInputDim("Predicted");
auto label_dims = ctx->GetInputDim("Labels");
PADDLE_ENFORCE_EQ(pred_dims, label_dims);
PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
"The rank of Input(Predicted) must be 2 and the shape is "
"[batch_size, 1].");
PADDLE_ENFORCE_EQ(pred_dims[1], 1,
"Each row of Input(Predicted) contains a real value, "
"so the 2nd dimension of Input(X) must be 1.");
ctx->SetOutputDim("Loss", {pred_dims[0], 1});
ctx->ShareLoD("Predicted", "Loss");
}
};
template <typename AttrType>
class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LogLossOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Predicted",
"The input value (Predicted) of Log loss op."
"Predicted is a 2-D tensor with shape [batch_size, 1].");
AddInput("Labels",
"The target value (Labels) of Log loss op."
"Labels is a 2-D tensor with shape [batch_size, 1].");
AddOutput("Loss",
"The output tensor with shape [batch_size, 1] "
"which represents the log loss.");
AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
AddComment(R"DOC(
LogLoss Operator.
Log loss is a loss function used for binary classification. Log Loss quantifies
the accuracy of a classifier by penalising false classifications. Minimising the
Log Loss is equivalent to maximising the accuracy of the classifier. We define
Predicted as the values predicted by our model and Labels as the target ground
truth value. Log loss can evaluate how close the predicted values are to the
target. The shapes of Predicted and Labels are both [batch_size, 1].
The equation is:
$$
Loss = - Labels * log(Predicted + \epsilon) -
(1 - Labels) * log(1 - Predicted + \epsilon)
$$
)DOC");
}
};
class LogLossGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Predicted"),
"Input(Predicted) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
"Input(Loss@GRAD) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
"Output(Predicted@GRAD) should not be null.");
auto pred_dims = ctx->GetInputDim("Predicted");
auto label_dims = ctx->GetInputDim("Labels");
auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
auto pred_grad_name = framework::GradVarName("Predicted");
ctx->SetOutputDim(pred_grad_name, pred_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
ops::LogLossGradOp);
REGISTER_OP_CPU_KERNEL(log_loss,
ops::LogLossKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/log_loss_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(log_loss,
ops::LogLossKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T, typename AttrType = T>
class LogLossKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* loss_out = ctx.Output<Tensor>("Loss");
loss_out->mutable_data<T>(ctx.GetPlace());
auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
auto loss = EigenVector<T>::Flatten(*loss_out);
auto place = ctx.GetEigenDevice<Place>();
loss.device(place) = (-(label * (prediction + epsilon).log()) -
((static_cast<T>(1) - label) *
(static_cast<T>(1) - prediction + epsilon).log()));
}
};
template <typename Place, typename T, typename AttrType = T>
class LogLossGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
auto dl = EigenVector<T>::Flatten(*dloss);
auto place = ctx.GetEigenDevice<Place>();
if (dpred) {
dpred->mutable_data<T>(ctx.GetPlace());
auto dx = framework::EigenVector<T>::Flatten(*dpred);
dx.device(place) = dl * (-(label / (prediction + epsilon)) +
((static_cast<T>(1) - label) /
(static_cast<T>(1) - prediction + epsilon)));
}
}
};
} // namespace operators
} // namespace paddle
...@@ -13,8 +13,9 @@ if(WITH_GPU) ...@@ -13,8 +13,9 @@ if(WITH_GPU)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
...@@ -26,8 +27,9 @@ else() ...@@ -26,8 +27,9 @@ else()
cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
endif() endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
......
...@@ -25,393 +25,397 @@ namespace detail { ...@@ -25,393 +25,397 @@ namespace detail {
#ifndef __NVCC__ #ifndef __NVCC__
template <class OpResetOutput, typename T> template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *gateValue, T *resetOutputValue, T *gate_value, T *reset_output_value,
T *prevOutputValue, int frameSize, T *prev_output_value, int frame_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
T rValueUpdateGate; T r_value_update_gate;
T rValueResetGate; T r_value_reset_gate;
T rValueResetOutput; T r_value_reset_output;
T rPrevOut = 0; T r_prev_out = 0;
T *updateGate = gateValue; T *update_gate = gate_value;
T *resetGate = gateValue + frameSize; T *reset_gate = gate_value + frame_size;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rValueUpdateGate = updateGate[i]; r_value_update_gate = update_gate[i];
rValueResetGate = resetGate[i]; r_value_reset_gate = reset_gate[i];
if (prevOutputValue) { if (prev_output_value) {
rPrevOut = prevOutputValue[i]; r_prev_out = prev_output_value[i];
} }
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
rValueResetOutput, active_gate); r_value_reset_output, active_gate);
updateGate[i] = rValueUpdateGate; update_gate[i] = r_value_update_gate;
resetGate[i] = rValueResetGate; reset_gate[i] = r_value_reset_gate;
resetOutputValue[i] = rValueResetOutput; reset_output_value[i] = r_value_reset_output;
} }
} }
template <class OpFinalOutput, typename T> template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput, void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *gateValue, T *prevOutputValue, T *gate_value, T *prev_output_value,
T *outputValue, int frameSize, T *output_value, int frame_size,
activation_mode_t active_node) { activation_mode_t active_node) {
T rValueUpdateGate; T r_value_update_gate;
T rValueFrameState; T r_value_frame_state;
T rPrevOut = 0; T r_prev_out = 0;
T rOutput; T r_output;
T *updateGate = gateValue; T *update_gate = gate_value;
T *frameState = gateValue + frameSize * 2; T *frame_state = gate_value + frame_size * 2;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rValueUpdateGate = updateGate[i]; r_value_update_gate = update_gate[i];
rValueFrameState = frameState[i]; r_value_frame_state = frame_state[i];
if (prevOutputValue) { if (prev_output_value) {
rPrevOut = prevOutputValue[i]; r_prev_out = prev_output_value[i];
} }
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
active_node); r_output, active_node);
frameState[i] = rValueFrameState; frame_state[i] = r_value_frame_state;
outputValue[i] = rOutput; output_value[i] = r_output;
} }
} }
template <class OpResetOutput, typename T> template <class OpResetOutput, typename T>
void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue, void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
T *resetOutputValue, T *prevOutputValue, T *gate_value, T *reset_output_value,
int frameSize, T *prev_output_value, int frame_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
#ifdef __AVX__ #ifdef __AVX__
__m256 rValueUpdateGate; __m256 r_value_update_gate;
__m256 rValueResetGate; __m256 r_value_reset_gate;
__m256 rValueResetOutput; __m256 r_value_reset_output;
__m256 rPrevOut = _mm256_set1_ps(0.0f); __m256 r_prev_out = _mm256_set1_ps(0.0f);
__m256 *updateGate = (__m256 *)gateValue; __m256 *update_gate = (__m256 *)gate_value;
__m256 *resetGate = (__m256 *)(gateValue + frameSize); __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
for (int i = 0; i < frameSize / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
rValueUpdateGate = updateGate[i]; r_value_update_gate = update_gate[i];
rValueResetGate = resetGate[i]; r_value_reset_gate = reset_gate[i];
if (prevOutputValue) { if (prev_output_value) {
rPrevOut = ((__m256 *)prevOutputValue)[i]; r_prev_out = ((__m256 *)prev_output_value)[i];
} }
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
rValueResetOutput, active_gate); r_value_reset_output, active_gate);
updateGate[i] = rValueUpdateGate; update_gate[i] = r_value_update_gate;
resetGate[i] = rValueResetGate; reset_gate[i] = r_value_reset_gate;
((__m256 *)resetOutputValue)[i] = rValueResetOutput; ((__m256 *)reset_output_value)[i] = r_value_reset_output;
} }
#endif #endif
} }
template <class OpFinalOutput, typename T> template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue, void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
T *prevOutputValue, T *outputValue, T *gate_value, T *prev_output_value,
int frameSize, T *output_value, int frame_size,
activation_mode_t active_node) { activation_mode_t active_node) {
#ifdef __AVX__ #ifdef __AVX__
__m256 rValueUpdateGate; __m256 r_value_update_gate;
__m256 rValueFrameState; __m256 r_value_frame_state;
__m256 rPrevOut = _mm256_set1_ps(0.0f); __m256 r_prev_out = _mm256_set1_ps(0.0f);
__m256 rOutput; __m256 r_output;
__m256 *updateGate = (__m256 *)gateValue; __m256 *update_gate = (__m256 *)gate_value;
__m256 *frameState = (__m256 *)(gateValue + frameSize * 2); __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
for (int i = 0; i < frameSize / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
rValueUpdateGate = updateGate[i]; r_value_update_gate = update_gate[i];
rValueFrameState = frameState[i]; r_value_frame_state = frame_state[i];
if (prevOutputValue) { if (prev_output_value) {
rPrevOut = ((__m256 *)prevOutputValue)[i]; r_prev_out = ((__m256 *)prev_output_value)[i];
} }
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
active_node); r_output, active_node);
frameState[i] = rValueFrameState; frame_state[i] = r_value_frame_state;
((__m256 *)outputValue)[i] = rOutput; ((__m256 *)output_value)[i] = r_output;
} }
#endif #endif
} }
template <class OpResetOutput, typename T> template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput opResetOutput, inline void forward_reset_output(OpResetOutput op_reset_output,
hl_gru_value<T> value, int frameSize, hl_gru_value<T> value, int frame_size,
int batchSize, activation_mode_t active_gate) { int batch_size,
for (int b = 0; b < batchSize; b++) { activation_mode_t active_gate) {
if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { for (int b = 0; b < batch_size; b++) {
if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output( hl_avx_gru_forward_reset_output(
opResetOutput, value.gateValue, value.resetOutputValue, op_reset_output, value.gate_value, value.reset_output_value,
value.prevOutValue, frameSize, active_gate); value.prev_out_value, frame_size, active_gate);
} else { } else {
hl_naive_gru_forward_reset_output( hl_naive_gru_forward_reset_output(
opResetOutput, value.gateValue, value.resetOutputValue, op_reset_output, value.gate_value, value.reset_output_value,
value.prevOutValue, frameSize, active_gate); value.prev_out_value, frame_size, active_gate);
} }
value.gateValue += frameSize * 3; value.gate_value += frame_size * 3;
value.resetOutputValue += frameSize; value.reset_output_value += frame_size;
if (value.prevOutValue) { if (value.prev_out_value) {
value.prevOutValue += frameSize; value.prev_out_value += frame_size;
} }
} }
} }
template <class OpFinalOutput, typename T> template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput opFinalOutput, inline void forward_final_output(OpFinalOutput op_final_output,
hl_gru_value<T> value, int frameSize, hl_gru_value<T> value, int frame_size,
int batchSize, activation_mode_t active_node) { int batch_size,
for (int b = 0; b < batchSize; b++) { activation_mode_t active_node) {
if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { for (int b = 0; b < batch_size; b++) {
hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue, if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
value.prevOutValue, value.outputValue, hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
frameSize, active_node); value.prev_out_value, value.output_value,
frame_size, active_node);
} else { } else {
hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue, hl_naive_gru_forward_final_output(
value.prevOutValue, value.outputValue, op_final_output, value.gate_value, value.prev_out_value,
frameSize, active_node); value.output_value, frame_size, active_node);
} }
value.gateValue += frameSize * 3; value.gate_value += frame_size * 3;
value.outputValue += frameSize; value.output_value += frame_size;
if (value.prevOutValue) { if (value.prev_out_value) {
value.prevOutValue += frameSize; value.prev_out_value += frame_size;
} }
} }
} }
template <class OpStateGrad, typename T> template <class OpStateGrad, typename T>
void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *outputGrad, T *prev_out_grad, T *output_grad,
int frameSize, int frame_size,
activation_mode_t active_node) { activation_mode_t active_node) {
T rUpdateGateValue; T r_update_gate_value;
T rUpdateGateGrad; T r_update_gate_grad;
T rFrameStateValue; T r_frame_state_value;
T rFrameStateGrad; T r_frame_state_grad;
T rOutGrad; T r_out_grad;
T rPrevOutValue = 0; T r_prev_out_value = 0;
T rPrevOutGrad = 0; T r_prev_out_grad = 0;
T *updateGateValue = gateValue; T *update_gate_value = gate_value;
T *updateGateGrad = gateGrad; T *update_gate_grad = gate_grad;
T *frameStateValue = gateValue + frameSize * 2; T *frame_state_value = gate_value + frame_size * 2;
T *frameStateGrad = gateGrad + frameSize * 2; T *frame_state_grad = gate_grad + frame_size * 2;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rUpdateGateValue = updateGateValue[i]; r_update_gate_value = update_gate_value[i];
rFrameStateValue = frameStateValue[i]; r_frame_state_value = frame_state_value[i];
rOutGrad = outputGrad[i]; r_out_grad = output_grad[i];
if (prevOutValue) { if (prev_out_value) {
rPrevOutValue = prevOutValue[i]; r_prev_out_value = prev_out_value[i];
} }
if (prevOutGrad) { if (prev_out_grad) {
rPrevOutGrad = prevOutGrad[i]; r_prev_out_grad = prev_out_grad[i];
} }
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
active_node); r_out_grad, active_node);
updateGateGrad[i] = rUpdateGateGrad; update_gate_grad[i] = r_update_gate_grad;
frameStateGrad[i] = rFrameStateGrad; frame_state_grad[i] = r_frame_state_grad;
if (prevOutGrad) { if (prev_out_grad) {
prevOutGrad[i] = rPrevOutGrad; prev_out_grad[i] = r_prev_out_grad;
} }
} }
} }
template <class OpResetGrad, typename T> template <class OpResetGrad, typename T>
void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *resetOutputGrad, T *prev_out_grad, T *reset_output_grad,
int frameSize, int frame_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
T rUpdateGateValue; T r_update_gate_value;
T rUpdateGateGrad; T r_update_gate_grad;
T rResetGateValue; T r_reset_gate_value;
T rResetGateGrad; T r_reset_gate_grad;
T rResetOutputGrad = 0; T r_reset_output_grad = 0;
T rPrevOutValue = 0; T r_prev_out_value = 0;
T rPrevOutGrad = 0; T r_prev_out_grad = 0;
T *updateGateValue = gateValue; T *update_gate_value = gate_value;
T *updateGateGrad = gateGrad; T *update_gate_grad = gate_grad;
T *resetGateValue = gateValue + frameSize; T *reset_gate_value = gate_value + frame_size;
T *resetGateGrad = gateGrad + frameSize; T *reset_gate_grad = gate_grad + frame_size;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rUpdateGateValue = updateGateValue[i]; r_update_gate_value = update_gate_value[i];
rUpdateGateGrad = updateGateGrad[i]; r_update_gate_grad = update_gate_grad[i];
rResetGateValue = resetGateValue[i]; r_reset_gate_value = reset_gate_value[i];
if (prevOutValue && prevOutGrad) { if (prev_out_value && prev_out_grad) {
rResetOutputGrad = resetOutputGrad[i]; r_reset_output_grad = reset_output_grad[i];
} }
if (prevOutValue) { if (prev_out_value) {
rPrevOutValue = prevOutValue[i]; r_prev_out_value = prev_out_value[i];
} }
if (prevOutGrad) { if (prev_out_grad) {
rPrevOutGrad = prevOutGrad[i]; r_prev_out_grad = prev_out_grad[i];
} }
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
active_gate); r_reset_output_grad, active_gate);
updateGateGrad[i] = rUpdateGateGrad; update_gate_grad[i] = r_update_gate_grad;
resetGateGrad[i] = rResetGateGrad; reset_gate_grad[i] = r_reset_gate_grad;
if (prevOutGrad) { if (prev_out_grad) {
prevOutGrad[i] = rPrevOutGrad; prev_out_grad[i] = r_prev_out_grad;
} }
} }
} }
template <class OpStateGrad, typename T> template <class OpStateGrad, typename T>
void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *outputGrad, T *prev_out_grad, T *output_grad,
int frameSize, int frame_size,
activation_mode_t active_node) { activation_mode_t active_node) {
#ifdef __AVX__ #ifdef __AVX__
__m256 rUpdateGateValue; __m256 r_update_gate_value;
__m256 rUpdateGateGrad; __m256 r_update_gate_grad;
__m256 rFrameStateValue; __m256 r_frame_state_value;
__m256 rFrameStateGrad; __m256 r_frame_state_grad;
__m256 rOutGrad; __m256 r_out_grad;
__m256 rPrevOutValue = _mm256_set1_ps(0.0f); __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
__m256 rPrevOutGrad = _mm256_set1_ps(0.0f); __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
__m256 *updateGateValue = (__m256 *)gateValue; __m256 *update_gate_value = (__m256 *)gate_value;
__m256 *updateGateGrad = (__m256 *)gateGrad; __m256 *update_gate_grad = (__m256 *)gate_grad;
__m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2); __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
__m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2); __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
for (int i = 0; i < frameSize / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
rUpdateGateValue = updateGateValue[i]; r_update_gate_value = update_gate_value[i];
rFrameStateValue = frameStateValue[i]; r_frame_state_value = frame_state_value[i];
rOutGrad = ((__m256 *)outputGrad)[i]; r_out_grad = ((__m256 *)output_grad)[i];
if (prevOutValue) { if (prev_out_value) {
rPrevOutValue = ((__m256 *)prevOutValue)[i]; r_prev_out_value = ((__m256 *)prev_out_value)[i];
} }
if (prevOutGrad) { if (prev_out_grad) {
rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
} }
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
active_node); r_out_grad, active_node);
updateGateGrad[i] = rUpdateGateGrad; update_gate_grad[i] = r_update_gate_grad;
frameStateGrad[i] = rFrameStateGrad; frame_state_grad[i] = r_frame_state_grad;
if (prevOutGrad) { if (prev_out_grad) {
((__m256 *)prevOutGrad)[i] = rPrevOutGrad; ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
} }
} }
#endif #endif
} }
template <class OpResetGrad, typename T> template <class OpResetGrad, typename T>
void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *resetOutputGrad, T *prev_out_grad, T *reset_output_grad,
int frameSize, int frame_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
#ifdef __AVX__ #ifdef __AVX__
__m256 rUpdateGateValue; __m256 r_update_gate_value;
__m256 rUpdateGateGrad; __m256 r_update_gate_grad;
__m256 rResetGateValue; __m256 r_reset_gate_value;
__m256 rResetGateGrad; __m256 r_reset_gate_grad;
__m256 rResetOutputGrad = _mm256_set1_ps(0.0f); __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
__m256 rPrevOutValue = _mm256_set1_ps(0.0f); __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
__m256 rPrevOutGrad = _mm256_set1_ps(0.0f); __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
__m256 *updateGateValue = (__m256 *)gateValue; __m256 *update_gate_value = (__m256 *)gate_value;
__m256 *updateGateGrad = (__m256 *)gateGrad; __m256 *update_gate_grad = (__m256 *)gate_grad;
__m256 *resetGateValue = (__m256 *)(gateValue + frameSize); __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
__m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize); __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
for (int i = 0; i < frameSize / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
rUpdateGateValue = updateGateValue[i]; r_update_gate_value = update_gate_value[i];
rUpdateGateGrad = updateGateGrad[i]; r_update_gate_grad = update_gate_grad[i];
rResetGateValue = resetGateValue[i]; r_reset_gate_value = reset_gate_value[i];
if (prevOutValue && prevOutGrad) { if (prev_out_value && prev_out_grad) {
rResetOutputGrad = ((__m256 *)resetOutputGrad)[i]; r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
} }
if (prevOutValue) { if (prev_out_value) {
rPrevOutValue = ((__m256 *)prevOutValue)[i]; r_prev_out_value = ((__m256 *)prev_out_value)[i];
} }
if (prevOutGrad) { if (prev_out_grad) {
rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
} }
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
active_gate); r_reset_output_grad, active_gate);
updateGateGrad[i] = rUpdateGateGrad; update_gate_grad[i] = r_update_gate_grad;
resetGateGrad[i] = rResetGateGrad; reset_gate_grad[i] = r_reset_gate_grad;
if (prevOutGrad) { if (prev_out_grad) {
((__m256 *)prevOutGrad)[i] = rPrevOutGrad; ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
} }
} }
#endif #endif
} }
template <class OpStateGrad, typename T> template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value, inline void backward_state_grad(OpStateGrad op_state_grad,
hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_node) { int frame_size, int batch_size,
for (int b = 0; b < batchSize; b++) { activation_mode_t active_node) {
if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { for (int b = 0; b < batch_size; b++) {
if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_state_grad( hl_avx_gru_backward_state_grad(
opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prevOutGrad, grad.outputGrad, frameSize, active_node); grad.prev_out_grad, grad.output_grad, frame_size, active_node);
} else { } else {
hl_naive_gru_backward_state_grad( hl_naive_gru_backward_state_grad(
opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prevOutGrad, grad.outputGrad, frameSize, active_node); grad.prev_out_grad, grad.output_grad, frame_size, active_node);
} }
value.gateValue += frameSize * 3; value.gate_value += frame_size * 3;
if (value.prevOutValue) { if (value.prev_out_value) {
value.prevOutValue += frameSize; value.prev_out_value += frame_size;
} }
grad.gateGrad += frameSize * 3; grad.gate_grad += frame_size * 3;
grad.outputGrad += frameSize; grad.output_grad += frame_size;
if (grad.prevOutGrad) { if (grad.prev_out_grad) {
grad.prevOutGrad += frameSize; grad.prev_out_grad += frame_size;
} }
} }
} }
template <class OpResetGrad, typename T> template <class OpResetGrad, typename T>
inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value, inline void backward_reset_grad(OpResetGrad op_reset_grad,
hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_gate) { int frame_size, int batch_size,
for (int b = 0; b < batchSize; b++) { activation_mode_t active_gate) {
if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { for (int b = 0; b < batch_size; b++) {
if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_reset_grad( hl_avx_gru_backward_reset_grad(
opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
} else { } else {
hl_naive_gru_backward_reset_grad( hl_naive_gru_backward_reset_grad(
opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
} }
value.gateValue += frameSize * 3; value.gate_value += frame_size * 3;
if (value.prevOutValue) { if (value.prev_out_value) {
value.prevOutValue += frameSize; value.prev_out_value += frame_size;
} }
grad.gateGrad += frameSize * 3; grad.gate_grad += frame_size * 3;
grad.resetOutputGrad += frameSize; grad.reset_output_grad += frame_size;
if (grad.prevOutGrad) { if (grad.prev_out_grad) {
grad.prevOutGrad += frameSize; grad.prev_out_grad += frame_size;
} }
} }
} }
......
...@@ -27,174 +27,174 @@ namespace math { ...@@ -27,174 +27,174 @@ namespace math {
namespace detail { namespace detail {
/* /*
* threads(framePerBlock, batchPerBlock) * threads(frame_per_block, batch_per_block)
* grid(frameBlocks, batchBlocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpResetOutput, bool isBatch, typename T> template <class OpResetOutput, bool is_batch, typename T>
__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput, __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
T *gateValue, T *resetOutputValue, T *gate_value, T *reset_output_value,
T *prevOutputValue, int frameSize, T *prev_output_value, int frame_size,
int batchSize, int batch_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frame_idx >= frame_size) return;
int batchIdx = 0; int batch_idx = 0;
if (isBatch) { if (is_batch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y; batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return; if (batch_idx >= batch_size) return;
gateValue += batchIdx * 3 * frameSize; gate_value += batch_idx * 3 * frame_size;
resetOutputValue += batchIdx * frameSize; reset_output_value += batch_idx * frame_size;
} }
T rPrevOut = 0; T r_prev_out = 0;
T rValueResetOutput; T r_value_reset_output;
T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
T rValueResetGate = gateValue[frameIdx + frameSize * 1]; T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
if (prevOutputValue) { if (prev_output_value) {
if (isBatch) prevOutputValue += batchIdx * frameSize; if (is_batch) prev_output_value += batch_idx * frame_size;
rPrevOut = prevOutputValue[frameIdx]; r_prev_out = prev_output_value[frame_idx];
} }
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput, op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
active_gate); r_value_reset_output, active_gate);
gateValue[frameIdx + frameSize * 0] = rValueUpdateGate; gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
gateValue[frameIdx + frameSize * 1] = rValueResetGate; gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
resetOutputValue[frameIdx] = rValueResetOutput; reset_output_value[frame_idx] = r_value_reset_output;
} }
/* /*
* threads(framePerBlock, batchPerBlock) * threads(frame_per_block, batch_per_block)
* grid(frameBlocks, batchBlocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpFinalOutput, bool isBatch, typename T> template <class OpFinalOutput, bool is_batch, typename T>
__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput, __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
T *gateValue, T *prevOutputValue, T *gate_value, T *prev_output_value,
T *outputValue, int frameSize, T *output_value, int frame_size,
int batchSize, int batch_size,
activation_mode_t active_node) { activation_mode_t active_node) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frame_idx >= frame_size) return;
int batchIdx = 0; int batch_idx = 0;
if (isBatch) { if (is_batch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y; batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return; if (batch_idx >= batch_size) return;
gateValue += batchIdx * 3 * frameSize; gate_value += batch_idx * 3 * frame_size;
outputValue += batchIdx * frameSize; output_value += batch_idx * frame_size;
} }
T rOutput; T r_output;
T rPrevOut = 0; T r_prev_out = 0;
T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
T rValueFrameState = gateValue[frameIdx + frameSize * 2]; T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
if (prevOutputValue) { if (prev_output_value) {
if (isBatch) prevOutputValue += batchIdx * frameSize; if (is_batch) prev_output_value += batch_idx * frame_size;
rPrevOut = prevOutputValue[frameIdx]; r_prev_out = prev_output_value[frame_idx];
} }
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
active_node); r_output, active_node);
gateValue[frameIdx + frameSize * 2] = rValueFrameState; gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
outputValue[frameIdx] = rOutput; output_value[frame_idx] = r_output;
} }
/* /*
* threads(framePerBlock, batchPerBlock) * threads(frame_per_block, batch_per_block)
* grid(frameBlocks, batchBlocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpStateGrad, bool isBatch, typename T> template <class OpStateGrad, bool is_batch, typename T>
__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue, __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *outputGrad, T *prev_out_grad, T *output_grad,
int frameSize, int batchSize, int frame_size, int batch_size,
activation_mode_t active_node) { activation_mode_t active_node) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frame_idx >= frame_size) return;
int batchIdx = 0; int batch_idx = 0;
if (isBatch) { if (is_batch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y; batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return; if (batch_idx >= batch_size) return;
gateValue += batchIdx * 3 * frameSize; gate_value += batch_idx * 3 * frame_size;
gateGrad += batchIdx * 3 * frameSize; gate_grad += batch_idx * 3 * frame_size;
outputGrad += batchIdx * frameSize; output_grad += batch_idx * frame_size;
} }
T rUpdateGateGrad; T r_update_gate_grad;
T rFrameStateGrad; T r_frame_state_grad;
T rPrevOutValue = 0; T r_prev_out_value = 0;
T rPrevOutGrad = 0; T r_prev_out_grad = 0;
T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
T rFrameStateValue = gateValue[frameIdx + frameSize * 2]; T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
T rOutGrad = outputGrad[frameIdx]; T r_out_grad = output_grad[frame_idx];
if (prevOutValue && prevOutGrad) { if (prev_out_value && prev_out_grad) {
if (isBatch) prevOutValue += batchIdx * frameSize; if (is_batch) prev_out_value += batch_idx * frame_size;
rPrevOutValue = prevOutValue[frameIdx]; r_prev_out_value = prev_out_value[frame_idx];
if (isBatch) prevOutGrad += batchIdx * frameSize; if (is_batch) prev_out_grad += batch_idx * frame_size;
rPrevOutGrad = prevOutGrad[frameIdx]; r_prev_out_grad = prev_out_grad[frame_idx];
} }
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
active_node); r_out_grad, active_node);
gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad; gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
if (prevOutGrad) { if (prev_out_grad) {
prevOutGrad[frameIdx] = rPrevOutGrad; prev_out_grad[frame_idx] = r_prev_out_grad;
} }
} }
/* /*
* threads(framePerBlock, batchPerBlock) * threads(frame_per_block, batch_per_block)
* grid(frameBlocks, batchBlocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpResetGrad, bool isBatch, typename T> template <class OpResetGrad, bool is_batch, typename T>
__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue, __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *resetOutputGrad, T *prev_out_grad, T *reset_output_grad,
int frameSize, int batchSize, int frame_size, int batch_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frame_idx >= frame_size) return;
int batchIdx = 0; int batch_idx = 0;
if (isBatch) { if (is_batch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y; batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return; if (batch_idx >= batch_size) return;
gateValue += batchIdx * 3 * frameSize; gate_value += batch_idx * 3 * frame_size;
gateGrad += batchIdx * 3 * frameSize; gate_grad += batch_idx * 3 * frame_size;
resetOutputGrad += batchIdx * frameSize; reset_output_grad += batch_idx * frame_size;
} }
T rResetGateGrad; T r_reset_gate_grad;
T rPrevOutValue = 0; T r_prev_out_value = 0;
T rPrevOutGrad = 0; T r_prev_out_grad = 0;
T rResetOutputGrad = 0; T r_reset_output_grad = 0;
T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0]; T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
T rResetGateValue = gateValue[frameIdx + frameSize * 1]; T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prevOutValue && prevOutGrad) { if (prev_out_value && prev_out_grad) {
if (isBatch) prevOutValue += batchIdx * frameSize; if (is_batch) prev_out_value += batch_idx * frame_size;
if (isBatch) prevOutGrad += batchIdx * frameSize; if (is_batch) prev_out_grad += batch_idx * frame_size;
rPrevOutValue = prevOutValue[frameIdx]; r_prev_out_value = prev_out_value[frame_idx];
rPrevOutGrad = prevOutGrad[frameIdx]; r_prev_out_grad = prev_out_grad[frame_idx];
rResetOutputGrad = resetOutputGrad[frameIdx]; r_reset_output_grad = reset_output_grad[frame_idx];
} }
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
active_gate); r_reset_output_grad, active_gate);
gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
gateGrad[frameIdx + frameSize * 1] = rResetGateGrad; gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
if (prevOutGrad) { if (prev_out_grad) {
prevOutGrad[frameIdx] = rPrevOutGrad; prev_out_grad[frame_idx] = r_prev_out_grad;
} }
} }
} // namespace detail } // namespace detail
......
...@@ -28,23 +28,25 @@ namespace forward { ...@@ -28,23 +28,25 @@ namespace forward {
template <typename T> template <typename T>
class gru_resetOutput { class gru_resetOutput {
public: public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut, HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
T &valueResetOutput, activation_mode_t actGate) { T &prev_out, T &value_reset_output,
valueUpdateGate = activation(valueUpdateGate, actGate); activation_mode_t act_gate) {
valueResetGate = activation(valueResetGate, actGate); value_update_gate = activation(value_update_gate, act_gate);
valueResetOutput = prevOut * valueResetGate; value_reset_gate = activation(value_reset_gate, act_gate);
value_reset_output = prev_out * value_reset_gate;
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate, HOSTDEVICE void operator()(__m256 &value_update_gate,
__m256 &prevOut, __m256 &valueResetOutput, __m256 &value_reset_gate, __m256 &prev_out,
activation_mode_t actGate) { __m256 &value_reset_output,
valueUpdateGate = activation(valueUpdateGate, actGate); activation_mode_t act_gate) {
valueResetGate = activation(valueResetGate, actGate); value_update_gate = activation(value_update_gate, act_gate);
valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate); value_reset_gate = activation(value_reset_gate, act_gate);
value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
} }
#endif #endif
#endif #endif
...@@ -53,24 +55,26 @@ class gru_resetOutput { ...@@ -53,24 +55,26 @@ class gru_resetOutput {
template <typename T> template <typename T>
class gru_finalOutput { class gru_finalOutput {
public: public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut, HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
T &valueOutput, activation_mode_t actInput) { T &prev_out, T &value_output,
valueFrameState = activation(valueFrameState, actInput); activation_mode_t act_input) {
valueOutput = prevOut - (valueUpdateGate * prevOut) + value_frame_state = activation(value_frame_state, act_input);
(valueUpdateGate * valueFrameState); value_output = prev_out - (value_update_gate * prev_out) +
(value_update_gate * value_frame_state);
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState, HOSTDEVICE void operator()(__m256 &value_update_gate,
__m256 &prevOut, __m256 &valueOutput, __m256 &value_frame_state, __m256 &prev_out,
activation_mode_t actInput) { __m256 &value_output,
valueFrameState = activation(valueFrameState, actInput); activation_mode_t act_input) {
valueOutput = _mm256_add_ps( value_frame_state = activation(value_frame_state, act_input);
_mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)), value_output = _mm256_add_ps(
_mm256_mul_ps(valueUpdateGate, valueFrameState)); _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
_mm256_mul_ps(value_update_gate, value_frame_state));
} }
#endif #endif
#endif #endif
...@@ -82,34 +86,37 @@ namespace backward { ...@@ -82,34 +86,37 @@ namespace backward {
template <typename T> template <typename T>
class gru_stateGrad { class gru_stateGrad {
public: public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate, HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
T &valueFrameState, T &gradFrameState, T &value_frame_state, T &grad_frame_state,
T &valuePrevOut, T &gradPrevOut, T &gradOutput, T &value_prev_out, T &grad_prev_out,
activation_mode_t actInput) { T &grad_output, activation_mode_t act_input) {
gradUpdateGate = (gradOutput * valueFrameState); grad_update_gate = (grad_output * value_frame_state);
gradUpdateGate -= (gradOutput * valuePrevOut); grad_update_gate -= (grad_output * value_prev_out);
gradPrevOut -= (gradOutput * valueUpdateGate); grad_prev_out -= (grad_output * value_update_gate);
gradPrevOut += gradOutput; grad_prev_out += grad_output;
gradFrameState = grad_frame_state = activation(grad_output * value_update_gate,
activation(gradOutput * valueUpdateGate, valueFrameState, actInput); value_frame_state, act_input);
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate, HOSTDEVICE void operator()(__m256 &value_update_gate,
__m256 &valueFrameState, __m256 &gradFrameState, __m256 &grad_update_gate,
__m256 &valuePrevOut, __m256 &gradPrevOut, __m256 &value_frame_state,
__m256 &gradOutput, activation_mode_t actInput) { __m256 &grad_frame_state, __m256 &value_prev_out,
gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState); __m256 &grad_prev_out, __m256 &grad_output,
gradUpdateGate = activation_mode_t act_input) {
_mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut)); grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
gradPrevOut = _mm256_add_ps( grad_update_gate = _mm256_sub_ps(
_mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)), grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
gradOutput); grad_prev_out = _mm256_add_ps(
gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate), _mm256_sub_ps(grad_prev_out,
valueFrameState, actInput); _mm256_mul_ps(grad_output, value_update_gate)),
grad_output);
grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
value_frame_state, act_input);
} }
#endif #endif
#endif #endif
...@@ -118,30 +125,32 @@ class gru_stateGrad { ...@@ -118,30 +125,32 @@ class gru_stateGrad {
template <typename T> template <typename T>
class gru_resetGrad { class gru_resetGrad {
public: public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate, HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
T &valueResetGate, T &gradResetGate, T &value_reset_gate, T &grad_reset_gate,
T &valuePrevOut, T &gradPrevOut, T &value_prev_out, T &grad_prev_out,
T &gradResetOutput, activation_mode_t actGate) { T &grad_reset_output, activation_mode_t act_gate) {
gradResetGate = (gradResetOutput * valuePrevOut); grad_reset_gate = (grad_reset_output * value_prev_out);
gradPrevOut += (gradResetOutput * valueResetGate); grad_prev_out += (grad_reset_output * value_reset_gate);
gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate); grad_update_gate =
gradResetGate = activation(gradResetGate, valueResetGate, actGate); activation(grad_update_gate, value_update_gate, act_gate);
grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate, HOSTDEVICE void operator()(__m256 &value_update_gate,
__m256 &valueResetGate, __m256 &gradResetGate, __m256 &grad_update_gate, __m256 &value_reset_gate,
__m256 &valuePrevOut, __m256 &gradPrevOut, __m256 &grad_reset_gate, __m256 &value_prev_out,
__m256 &gradResetOutput, __m256 &grad_prev_out, __m256 &grad_reset_output,
activation_mode_t actGate) { activation_mode_t act_gate) {
gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut); grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
gradPrevOut = _mm256_add_ps(gradPrevOut, grad_prev_out = _mm256_add_ps(
_mm256_mul_ps(gradResetOutput, valueResetGate)); grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate); grad_update_gate =
gradResetGate = activation(gradResetGate, valueResetGate, actGate); activation(grad_update_gate, value_update_gate, act_gate);
grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
} }
#endif #endif
#endif #endif
......
...@@ -21,29 +21,29 @@ namespace math { ...@@ -21,29 +21,29 @@ namespace math {
template <typename T> template <typename T>
struct GRUUnitFunctor<platform::CPUPlace, T> { struct GRUUnitFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize, hl_gru_value<T> value, int frame_size, int batch_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate) { activation_mode_t active_gate) {
#ifndef __NVCC__ #ifndef __NVCC__
if (value.prevOutValue) { if (value.prev_out_value) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, false, false, batchSize, frameSize * 2, frameSize, 1, context, false, false, batch_size, frame_size * 2, frame_size, 1,
value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
value.gateValue, frameSize * 3); 1, value.gate_value, frame_size * 3);
} }
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value, detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frameSize, batchSize, active_gate); frame_size, batch_size, active_gate);
if (value.prevOutValue) { if (value.prev_out_value) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, false, false, batchSize, frameSize, frameSize, 1, context, false, false, batch_size, frame_size, frame_size, 1,
value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, value.reset_output_value, frame_size, value.state_weight, frame_size,
value.gateValue + frameSize * 2, frameSize * 3); 1, value.gate_value + frame_size * 2, frame_size * 3);
} }
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value, detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frameSize, batchSize, active_node); frame_size, batch_size, active_node);
#endif #endif
} }
}; };
...@@ -51,41 +51,43 @@ struct GRUUnitFunctor<platform::CPUPlace, T> { ...@@ -51,41 +51,43 @@ struct GRUUnitFunctor<platform::CPUPlace, T> {
template <typename T> template <typename T>
struct GRUUnitGradFunctor<platform::CPUPlace, T> { struct GRUUnitGradFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_node, int frame_size, int batch_size,
activation_mode_t active_node,
activation_mode_t active_gate) { activation_mode_t active_gate) {
#ifndef __NVCC__ #ifndef __NVCC__
detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value, detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
grad, frameSize, batchSize, active_node); grad, frame_size, batch_size, active_node);
if (value.prevOutValue && grad.prevOutGrad) { if (value.prev_out_value && grad.prev_out_grad) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize, 1, context, false, true, batch_size, frame_size, frame_size, 1,
grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
frameSize, 0, grad.resetOutputGrad, frameSize); frame_size, 0, grad.reset_output_grad, frame_size);
if (grad.stateWeightGrad) { if (grad.state_weight_grad) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, true, false, frameSize, frameSize, batchSize, 1, context, true, false, frame_size, frame_size, batch_size, 1,
value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, value.reset_output_value, frame_size,
frameSize * 3, 1, grad.stateWeightGrad, frameSize); grad.gate_grad + frame_size * 2, frame_size * 3, 1,
grad.state_weight_grad, frame_size);
} }
} }
detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value, detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
grad, frameSize, batchSize, active_gate); grad, frame_size, batch_size, active_gate);
if (grad.prevOutGrad && value.prevOutValue) { if (grad.prev_out_grad && value.prev_out_value) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize * 2, 1, context, false, true, batch_size, frame_size, frame_size * 2, 1,
grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
grad.prevOutGrad, frameSize); grad.prev_out_grad, frame_size);
if (grad.gateWeightGrad) { if (grad.gate_weight_grad) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, true, false, frameSize, frameSize * 2, batchSize, 1, context, true, false, frame_size, frame_size * 2, batch_size, 1,
value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
grad.gateWeightGrad, frameSize * 2); grad.gate_weight_grad, frame_size * 2);
} }
} }
#endif #endif
......
...@@ -21,66 +21,66 @@ namespace math { ...@@ -21,66 +21,66 @@ namespace math {
template <typename T> template <typename T>
struct GRUUnitFunctor<platform::GPUPlace, T> { struct GRUUnitFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize, hl_gru_value<T> value, int frame_size, int batch_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate) { activation_mode_t active_gate) {
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(context).stream(); reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
dim3 threads; dim3 threads;
dim3 grid; dim3 grid;
if (batchSize == 1) { if (batch_size == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024; int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024; int frame_blocks = (frame_size + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1); threads = dim3(frame_per_block, 1);
grid = dim3(frameBlocks, 1); grid = dim3(frame_blocks, 1);
} else { } else {
threads = dim3(32, 32); threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
} }
if (value.prevOutValue) { if (value.prev_out_value) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, false, false, batchSize, frameSize * 2, frameSize, 1, context, false, false, batch_size, frame_size * 2, frame_size, 1,
value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
value.gateValue, frameSize * 3); 1, value.gate_value, frame_size * 3);
} }
if (batchSize == 1) { if (batch_size == 1) {
detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>, detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
/* isBatch= */ false, /* is_batch= */ false,
T><<<grid, threads, 0, stream>>>( T><<<grid, threads, 0, stream>>>(
detail::forward::gru_resetOutput<T>(), value.gateValue, detail::forward::gru_resetOutput<T>(), value.gate_value,
value.resetOutputValue, value.prevOutValue, frameSize, batchSize, value.reset_output_value, value.prev_out_value, frame_size,
active_gate); batch_size, active_gate);
} else { } else {
detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>, detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
/* isBatch= */ true, /* is_batch= */ true,
T><<<grid, threads, 0, stream>>>( T><<<grid, threads, 0, stream>>>(
detail::forward::gru_resetOutput<T>(), value.gateValue, detail::forward::gru_resetOutput<T>(), value.gate_value,
value.resetOutputValue, value.prevOutValue, frameSize, batchSize, value.reset_output_value, value.prev_out_value, frame_size,
active_gate); batch_size, active_gate);
} }
if (value.prevOutValue) { if (value.prev_out_value) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, false, false, batchSize, frameSize, frameSize, 1, context, false, false, batch_size, frame_size, frame_size, 1,
value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, value.reset_output_value, frame_size, value.state_weight, frame_size,
value.gateValue + frameSize * 2, frameSize * 3); 1, value.gate_value + frame_size * 2, frame_size * 3);
} }
if (batchSize == 1) { if (batch_size == 1) {
detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>, detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
/* isBatch= */ false, /* is_batch= */ false,
T><<<grid, threads, 0, stream>>>( T><<<grid, threads, 0, stream>>>(
detail::forward::gru_finalOutput<T>(), value.gateValue, detail::forward::gru_finalOutput<T>(), value.gate_value,
value.prevOutValue, value.outputValue, frameSize, batchSize, value.prev_out_value, value.output_value, frame_size, batch_size,
active_node); active_node);
} else { } else {
detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>, detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
/* isBatch= */ true, /* is_batch= */ true,
T><<<grid, threads, 0, stream>>>( T><<<grid, threads, 0, stream>>>(
detail::forward::gru_finalOutput<T>(), value.gateValue, detail::forward::gru_finalOutput<T>(), value.gate_value,
value.prevOutValue, value.outputValue, frameSize, batchSize, value.prev_out_value, value.output_value, frame_size, batch_size,
active_node); active_node);
} }
} }
...@@ -89,80 +89,82 @@ struct GRUUnitFunctor<platform::GPUPlace, T> { ...@@ -89,80 +89,82 @@ struct GRUUnitFunctor<platform::GPUPlace, T> {
template <typename T> template <typename T>
struct GRUUnitGradFunctor<platform::GPUPlace, T> { struct GRUUnitGradFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_node, int frame_size, int batch_size,
activation_mode_t active_node,
activation_mode_t active_gate) { activation_mode_t active_gate) {
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(context).stream(); reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
dim3 threads; dim3 threads;
dim3 grid; dim3 grid;
if (batchSize == 1) { if (batch_size == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024; int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024; int frame_blocks = (frame_size + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1); threads = dim3(frame_per_block, 1);
grid = dim3(frameBlocks, 1); grid = dim3(frame_blocks, 1);
} else { } else {
threads = dim3(32, 32); threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
} }
if (batchSize == 1) { if (batch_size == 1) {
detail::KeGruBackwardStateGrad< detail::KeGruBackwardStateGrad<
detail::backward::gru_stateGrad<T>, detail::backward::gru_stateGrad<T>,
/* isBatch= */ false><<<grid, threads, 0, stream>>>( /* is_batch= */ false><<<grid, threads, 0, stream>>>(
detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad, detail::backward::gru_stateGrad<T>(), value.gate_value,
value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
batchSize, active_node); grad.output_grad, frame_size, batch_size, active_node);
} else { } else {
detail::KeGruBackwardStateGrad< detail::KeGruBackwardStateGrad<
detail::backward::gru_stateGrad<T>, detail::backward::gru_stateGrad<T>,
/* isBatch= */ true><<<grid, threads, 0, stream>>>( /* is_batch= */ true><<<grid, threads, 0, stream>>>(
detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad, detail::backward::gru_stateGrad<T>(), value.gate_value,
value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
batchSize, active_node); grad.output_grad, frame_size, batch_size, active_node);
} }
if (value.prevOutValue && grad.prevOutGrad) { if (value.prev_out_value && grad.prev_out_grad) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize, 1, context, false, true, batch_size, frame_size, frame_size, 1,
grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
frameSize, 0, grad.resetOutputGrad, frameSize); frame_size, 0, grad.reset_output_grad, frame_size);
if (grad.stateWeightGrad) { if (grad.state_weight_grad) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, true, false, frameSize, frameSize, batchSize, 1, context, true, false, frame_size, frame_size, batch_size, 1,
value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, value.reset_output_value, frame_size,
frameSize * 3, 1, grad.stateWeightGrad, frameSize); grad.gate_grad + frame_size * 2, frame_size * 3, 1,
grad.state_weight_grad, frame_size);
} }
} }
if (batchSize == 1) { if (batch_size == 1) {
detail::KeGruBackwardResetGrad< detail::KeGruBackwardResetGrad<
detail::backward::gru_resetGrad<T>, detail::backward::gru_resetGrad<T>,
/* isBatch= */ false><<<grid, threads, 0, stream>>>( /* is_batch= */ false><<<grid, threads, 0, stream>>>(
detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad, detail::backward::gru_resetGrad<T>(), value.gate_value,
value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
batchSize, active_gate); grad.reset_output_grad, frame_size, batch_size, active_gate);
} else { } else {
detail::KeGruBackwardResetGrad< detail::KeGruBackwardResetGrad<
detail::backward::gru_resetGrad<T>, detail::backward::gru_resetGrad<T>,
/* isBatch= */ true><<<grid, threads, 0, stream>>>( /* is_batch= */ true><<<grid, threads, 0, stream>>>(
detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad, detail::backward::gru_resetGrad<T>(), value.gate_value,
value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
batchSize, active_gate); grad.reset_output_grad, frame_size, batch_size, active_gate);
} }
if (grad.prevOutGrad && value.prevOutValue) { if (grad.prev_out_grad && value.prev_out_value) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize * 2, 1, context, false, true, batch_size, frame_size, frame_size * 2, 1,
grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
grad.prevOutGrad, frameSize); grad.prev_out_grad, frame_size);
if (grad.gateWeightGrad) { if (grad.gate_weight_grad) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, true, false, frameSize, frameSize * 2, batchSize, 1, context, true, false, frame_size, frame_size * 2, batch_size, 1,
value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
grad.gateWeightGrad, frameSize * 2); grad.gate_weight_grad, frame_size * 2);
} }
} }
} }
......
...@@ -22,28 +22,28 @@ namespace math { ...@@ -22,28 +22,28 @@ namespace math {
// TODO(guosheng): refine code style in gru_compute // TODO(guosheng): refine code style in gru_compute
template <typename T> template <typename T>
struct hl_gru_value { struct hl_gru_value {
T *gateWeight; T *gate_weight;
T *stateWeight; T *state_weight;
T *gateValue; T *gate_value;
T *resetOutputValue; T *reset_output_value;
T *outputValue; T *output_value;
T *prevOutValue; T *prev_out_value;
}; };
template <typename T> template <typename T>
struct hl_gru_grad { struct hl_gru_grad {
T *gateWeightGrad; T *gate_weight_grad;
T *stateWeightGrad; T *state_weight_grad;
T *gateGrad; T *gate_grad;
T *resetOutputGrad; T *reset_output_grad;
T *outputGrad; T *output_grad;
T *prevOutGrad; T *prev_out_grad;
}; };
template <typename Place, typename T> template <typename Place, typename T>
struct GRUUnitFunctor { struct GRUUnitFunctor {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize, hl_gru_value<T> value, int frame_size, int batch_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate); activation_mode_t active_gate);
}; };
...@@ -51,8 +51,9 @@ struct GRUUnitFunctor { ...@@ -51,8 +51,9 @@ struct GRUUnitFunctor {
template <typename Place, typename T> template <typename Place, typename T>
struct GRUUnitGradFunctor { struct GRUUnitGradFunctor {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_node, int frame_size, int batch_size,
activation_mode_t active_node,
activation_mode_t active_gate); activation_mode_t active_gate);
}; };
......
...@@ -23,8 +23,7 @@ template <typename T> ...@@ -23,8 +23,7 @@ template <typename T>
class MaxOutFunctor<platform::CPUPlace, T> { class MaxOutFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input, framework::Tensor* output,
framework::Tensor * output,
int groups) { int groups) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
...@@ -43,28 +42,24 @@ class MaxOutFunctor<platform::CPUPlace, T> { ...@@ -43,28 +42,24 @@ class MaxOutFunctor<platform::CPUPlace, T> {
for (int f = 0; f < fea_size; ++f) { for (int f = 0; f < fea_size; ++f) {
T ele = static_cast<T>(-FLT_MAX); T ele = static_cast<T>(-FLT_MAX);
for (int ph = 0; ph < groups; ++ph) { for (int ph = 0; ph < groups; ++ph) {
T x = input_data[(new_bindex + new_cindex) * groups T x = input_data[(new_bindex + new_cindex) * groups +
+ ph * fea_size + f]; ph * fea_size + f];
ele = ele > x ? ele : x; ele = ele > x ? ele : x;
} }
output_data[(new_bindex+new_cindex+f)] = ele; output_data[(new_bindex + new_cindex + f)] = ele;
} }
} }
} }
} }
}; };
template <class T> template <class T>
class MaxOutGradFunctor<platform::CPUPlace, T> { class MaxOutGradFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input, framework::Tensor* input_grad,
framework::Tensor * input_grad,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, const framework::Tensor& output_grad, int groups) {
int groups) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
......
...@@ -21,9 +21,9 @@ namespace math { ...@@ -21,9 +21,9 @@ namespace math {
template <typename T> template <typename T>
__global__ void KernelMaxOut(const int nthreads, const T* input_data, __global__ void KernelMaxOut(const int nthreads, const T* input_data,
const int channels, const int channels, const int input_height,
const int input_height, const int input_width, const int input_width, int groups,
int groups, T* output_data ) { T* output_data) {
const int size = input_height * input_width * channels / groups; const int size = input_height * input_width * channels / groups;
const int feat_len = input_height * input_width; const int feat_len = input_height * input_width;
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -44,10 +44,11 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data, ...@@ -44,10 +44,11 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data,
} }
} }
template <typename T> template <typename T>
__global__ void KernelMaxoutGrad( __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
const int nthreads, const T* input_data, const T* output_data, const T* output_data, const T* output_grad,
const T* output_grad, T* input_grad, const int channels, T* input_grad, const int channels,
const int input_height, const int input_width, int groups) { const int input_height, const int input_width,
int groups) {
const int size = input_height * input_width * channels / groups; const int size = input_height * input_width * channels / groups;
const int feat_len = input_height * input_width; const int feat_len = input_height * input_width;
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -80,7 +81,7 @@ template <typename T> ...@@ -80,7 +81,7 @@ template <typename T>
class MaxOutFunctor<platform::GPUPlace, T> { class MaxOutFunctor<platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor * output, const framework::Tensor& input, framework::Tensor* output,
int groups) { int groups) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -101,8 +102,7 @@ class MaxOutFunctor<platform::GPUPlace, T> { ...@@ -101,8 +102,7 @@ class MaxOutFunctor<platform::GPUPlace, T> {
T><<<grid, threads, 0, T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(nthreads, input_data, input_channels, .stream()>>>(nthreads, input_data, input_channels,
input_height, input_width, groups, input_height, input_width, groups, output_data);
output_data);
} }
}; };
/* /*
...@@ -112,11 +112,9 @@ template <typename T> ...@@ -112,11 +112,9 @@ template <typename T>
class MaxOutGradFunctor<platform::GPUPlace, T> { class MaxOutGradFunctor<platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input, framework::Tensor* input_grad,
framework::Tensor * input_grad,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, const framework::Tensor& output_grad, int groups) {
int groups) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
...@@ -137,9 +135,9 @@ class MaxOutGradFunctor<platform::GPUPlace, T> { ...@@ -137,9 +135,9 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
KernelMaxoutGrad< KernelMaxoutGrad<
T><<<grid, threads, 0, T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>( .stream()>>>(nthreads, input_data, output_data,
nthreads, input_data, output_data, output_grad_data, input_grad_data, output_grad_data, input_grad_data, input_channels,
input_channels, input_height, input_width, groups); input_height, input_width, groups);
} }
}; };
......
...@@ -21,15 +21,14 @@ namespace paddle { ...@@ -21,15 +21,14 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
#define FLT_MAX \ #define FLT_MAX __FLT_MAX__
__FLT_MAX__
template <typename Place, typename T> template <typename Place, typename T>
class MaxOutFunctor { class MaxOutFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor * output, const framework::Tensor& input, framework::Tensor* output,
int groups); int groups);
}; };
...@@ -37,8 +36,7 @@ template <typename Place, class T> ...@@ -37,8 +36,7 @@ template <typename Place, class T>
class MaxOutGradFunctor { class MaxOutGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input, framework::Tensor* input_grad,
framework::Tensor * input_grad,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, int groups); const framework::Tensor& output_grad, int groups);
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/unpooling.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class Unpool2dMaxFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& indices, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
int input_feasize = input_height * input_width;
int output_feasize = output_height * output_width;
const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(context.GetPlace());
for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) {
int index = indices_data[i];
PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
output_data[index] = input_data[i];
}
input_data += input_feasize;
indices_data += input_feasize;
output_data += output_feasize;
}
}
}
};
template <class T>
class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& indices,
const framework::Tensor& output,
const framework::Tensor& output_grad,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
int input_feasize = input_height * input_width;
int output_feasize = output_height * output_width;
const int* indices_data = indices.data<int>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) {
int index = indices_data[i];
PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
input_grad_data[i] = output_grad_data[index];
}
input_grad_data += input_feasize;
indices_data += input_feasize;
output_grad_data += output_feasize;
}
}
}
};
template class Unpool2dMaxGradFunctor<platform::CPUPlace, float>;
template class Unpool2dMaxGradFunctor<platform::CPUPlace, double>;
template class Unpool2dMaxFunctor<platform::CPUPlace, float>;
template class Unpool2dMaxFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/unpooling.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
const int* indices_data,
const int input_height, const int input_width,
const int channels, T* output_data,
const int output_height,
const int output_width) {
int in_n_stride = input_height * input_width * channels;
int in_c_stride = input_height * input_width;
int out_n_stride = output_height * output_width * channels;
int out_c_stride = output_height * output_width;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) {
int bidx = i / in_n_stride;
int boffset = i % in_n_stride;
int cidx = boffset / in_c_stride;
int out_offset = bidx * out_n_stride + cidx * out_c_stride;
int out_index = indices_data[i];
PADDLE_ASSERT(out_index < out_c_stride);
output_data[out_offset + out_index] = input_data[i];
}
}
template <typename T>
__global__ void KernelUnpool2dMaxGrad(
const int nthreads, const T* input_data, const int* indices_data,
const int input_height, const int input_width, const int channels,
const T* output_data, const T* output_grad, const int output_height,
const int output_width, T* input_grad) {
int in_n_stride = input_height * input_width * channels;
int in_c_stride = input_height * input_width;
int out_n_stride = output_height * output_width * channels;
int out_c_stride = output_height * output_width;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) {
int bidx = i / in_n_stride;
int boffset = i % in_n_stride;
int cidx = boffset / in_c_stride;
int out_offset = bidx * out_n_stride + cidx * out_c_stride;
int out_index = indices_data[i];
PADDLE_ASSERT(out_index < out_c_stride);
input_grad[i] = output_grad[out_offset + out_index];
}
}
/*
* All tensors are in NCHW format.
*/
template <typename T>
class Unpool2dMaxFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& indices, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int threads = 1024;
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool2dMax<
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(input.numel(), input_data, indices_data,
input_height, input_width, output_channels,
output_data, output_height, output_width);
}
};
/*
* All tensors are in NCHW format.
*/
template <typename T>
class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& indices,
const framework::Tensor& output,
const framework::Tensor& output_grad,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int threads = 1024;
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool2dMaxGrad<
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(input.numel(), input_data, indices_data,
input_height, input_width, output_channels,
output_data, output_grad_data, output_height,
output_width, input_grad_data);
}
};
template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
template class Unpool2dMaxGradFunctor<platform::GPUPlace, double>;
template class Unpool2dMaxFunctor<platform::GPUPlace, float>;
template class Unpool2dMaxFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
template <typename Place, typename T>
class Unpool2dMaxFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& indices, framework::Tensor* output);
};
template <typename Place, class T>
class Unpool2dMaxGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& indices,
const framework::Tensor& output,
const framework::Tensor& output_grad,
framework::Tensor* input_grad);
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -22,7 +22,8 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -22,7 +22,8 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", AddInput(
"X",
"(Tensor) The input tensor of maxout operator. " "(Tensor) The input tensor of maxout operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the " "The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature."); "number of channels, H and W is the height and width of feature.");
...@@ -59,21 +60,19 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -59,21 +60,19 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
} }
}; };
class MaxOutOp : public framework::OperatorWithKernel { class MaxOutOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MaxoutOp" PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of MaxoutOp"
"should not be null."); "should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of MaxoutOp should not be null."); "Output(Out) of MaxoutOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X"); auto in_x_dims = ctx->GetInputDim("X");
int groups = ctx->Attrs().Get<int>("groups"); int groups = ctx->Attrs().Get<int>("groups");
// check groups > 1 // check groups > 1
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop");
groups, 1,
"groups should be larger than 1 in maxoutop");
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups}); std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
output_shape.push_back(in_x_dims[2]); output_shape.push_back(in_x_dims[2]);
output_shape.push_back(in_x_dims[3]); output_shape.push_back(in_x_dims[3]);
...@@ -97,8 +96,7 @@ class MaxOutOpGrad : public framework::OperatorWithKernel { ...@@ -97,8 +96,7 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad, REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
ops::MaxOutOpGrad); ops::MaxOutOpGrad);
REGISTER_OP_CPU_KERNEL(maxout, ops::MaxOutKernel<paddle::platform::CPUPlace, REGISTER_OP_CPU_KERNEL(maxout,
float>); ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(maxout_grad, REGISTER_OP_CPU_KERNEL(
ops::MaxOutGradKernel<paddle::platform::CPUPlace, maxout_grad, ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
float>);
...@@ -18,8 +18,6 @@ namespace ops = paddle::operators; ...@@ -18,8 +18,6 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(maxout, REGISTER_OP_GPU_KERNEL(maxout,
ops::MaxOutKernel<paddle::platform::GPUPlace, float>, ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
ops::MaxOutKernel<paddle::platform::GPUPlace, double>); ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(maxout_grad, REGISTER_OP_GPU_KERNEL(
ops::MaxOutGradKernel<paddle::platform::GPUPlace, maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
float>, ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
ops::MaxOutGradKernel<paddle::platform::GPUPlace,
double>);
...@@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, ...@@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector<int>, defalut {0,0}), paddings(height, width) of pooling " "(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator." "operator."
"If global_pooling = true, paddings and ksize will be ignored.") "If global_pooling = true, paddings and ksize will be ignored.")
.SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently,
...@@ -127,10 +127,10 @@ Example: ...@@ -127,10 +127,10 @@ Example:
X shape: $(N, C, H_{in}, W_{in})$ X shape: $(N, C, H_{in}, W_{in})$
Output: Output:
Out shape: $(N, C, H_{out}, W_{out})$ Out shape: $(N, C, H_{out}, W_{out})$
where Where
$$ $$
H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
$$ $$
)DOC"); )DOC");
...@@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, ...@@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector<int>, defalut {0,0,0}), paddings(depth, height, " "(vector<int>, default {0,0,0}), paddings(depth, height, "
"width) of pooling operator. " "width) of pooling operator. "
"If global_pooling = true, ksize and paddings will be ignored.") "If global_pooling = true, ksize and paddings will be ignored.")
.SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently,
...@@ -199,11 +199,11 @@ Example: ...@@ -199,11 +199,11 @@ Example:
X shape: $(N, C, D_{in}, H_{in}, W_{in})$ X shape: $(N, C, D_{in}, H_{in}, W_{in})$
Output: Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
where Where
$$ $$
D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$ $$
)DOC"); )DOC");
......
...@@ -142,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -142,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector<int>, defalut:{0, 0}), paddings(height, width) of pooling " "(vector<int>, default:{0, 0}), paddings(height, width) of pooling "
"operator. " "operator. "
"If global_pooling = true, paddings and will be ignored.") "If global_pooling = true, paddings and will be ignored.")
.SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently,
...@@ -166,10 +166,10 @@ Example: ...@@ -166,10 +166,10 @@ Example:
Output: Output:
Out shape: $(N, C, H_{out}, W_{out})$ Out shape: $(N, C, H_{out}, W_{out})$
Mask shape: $(N, C, H_{out}, W_{out})$ Mask shape: $(N, C, H_{out}, W_{out})$
where Where
$$ $$
H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
$$ $$
)DOC"); )DOC");
...@@ -220,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -220,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector, defalut {0,0,0}), paddings(depth, " "(vector, default {0,0,0}), paddings(depth, "
"height, width) of pooling operator. " "height, width) of pooling operator. "
"If global_pooling = true, paddings and ksize will be ignored.") "If global_pooling = true, paddings and ksize will be ignored.")
.SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently,
...@@ -244,11 +244,11 @@ Example: ...@@ -244,11 +244,11 @@ Example:
Output: Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
where Where
$$ $$
D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$ $$
)DOC"); )DOC");
......
...@@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel { ...@@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel {
auto right_dims = ctx->GetInputDim("Right"); auto right_dims = ctx->GetInputDim("Right");
PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
"All inputs must have the same size"); "All inputs must have the same size.");
PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1), PADDLE_ENFORCE(
"All inputs must be row vector with size batch_size x 1."); (label_dims.size() == 2) && (label_dims[1] == 1),
"All inputs must be 2-D tensors with shape [batch_size x 1].");
ctx->SetOutputDim("Out", label_dims); ctx->SetOutputDim("Out", label_dims);
} }
}; };
...@@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker *op_checker) framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Label", AddInput("Label",
"The label indicating A ranked higher than B or not, row vector."); "(2-D Tensor with shape [batch_size x 1]) "
AddInput("Left", "The output of RankNet for doc A, vector."); "The label indicating A ranked higher than B or not.");
AddInput("Right", "The output of RankNet for doc B, vetor."); AddInput("Left",
AddOutput("Out", "The output loss of RankLoss operator, vector."); "(2-D Tensor with shape [batch_size x 1]) "
"The output of RankNet for doc A.");
AddInput("Right",
"(2-D Tensor with shape [batch_size x 1]) "
"The output of RankNet for doc B.");
AddOutput("Out",
"(2-D Tensor with shape [batch_size x 1]) "
"The output loss of RankLoss operator.");
AddComment(R"DOC( AddComment(R"DOC(
RankLoss Operator. RankLoss Operator.
...@@ -65,16 +73,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of ...@@ -65,16 +73,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
the input pair. the input pair.
The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
(P_{i,j}), which represent the output of RankNet for the two docs and the label, (P_{i,j}), which represent the output score of RankNet for the two docs and
respectively, and yields the rank loss C_{i,j} using the following equation: the label respectively, and yields the rank loss C_{i,j} using the following
equation:
\f$$ $$
C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
o_{i,j} = o_i - o_j \\ o_{i,j} = o_i - o_j \\
\tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
\f$$ $$
The operator can take inputs of one sample or in batch. The operator can take batch inputs with size batch_size (batch_size >= 1).
)DOC"); )DOC");
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream>
#include <thread>
#include <unistd.h>
#include "paddle/framework/data_type.h"
#include "paddle/framework/executor.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/send_recv_impl.h"
#include "paddle/operators/detail/simple_block_queue.h"
namespace paddle {
namespace operators {
void RunServer(Server **rpc_server,
std::shared_ptr<detail::SendRecvServerImpl> service,
const std::string &server_address) {
ServerBuilder builder;
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
builder.RegisterService(service.get());
std::unique_ptr<Server> server(builder.BuildAndStart());
*rpc_server = server.get();
LOG(INFO) << "Server listening on " << server_address << std::endl;
server->Wait();
}
class RecvOp : public framework::OperatorBase {
public:
RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
rpc_service_.reset(new detail::SendRecvServerImpl());
std::string endpoint = Attr<std::string>("endpoint");
server_thread_.reset(
new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint));
}
}
virtual ~RecvOp() {
rpc_server_->Shutdown();
server_thread_->join();
}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
// blocking get one var from client.
const framework::LoDTensor &t = rpc_service_->Get();
framework::Scope &recv_scope = scope.NewScope();
// set graph input var
auto *var = recv_scope.Var(Input("RX"));
auto *tensor = var->GetMutable<framework::LoDTensor>();
// FIXME(typhoonzero): do not copy
framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
auto *block = Attr<framework::BlockDescBind *>("OptimizeBlock");
auto *program = block->Program();
framework::Executor executor(dev_ctx);
// Run sub graph to get optimized tensor
executor.Run(*program, &recv_scope, block->ID(),
false /*create_local_scope*/);
auto *out_var = recv_scope.FindVar("Out");
// push back
rpc_service_->Push(out_var->Get<framework::LoDTensor>());
}
protected:
// grpc server instance to track status and gracefully shutdown.
// borrow an pointer from server thread.
Server *rpc_server_{nullptr};
// grpc send/recv service implement to register.
std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
};
class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("RX", "(Tensor) Input tensor to be saved");
AddComment(R"DOC(
Recv operator
This operator will recv tensor from send_op
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDescBind *>("OptimizeBlock", "type BlockDescBind*",
"optimize network run in server");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
...@@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel { ...@@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel {
// TODO(qiao) change batch_size // TODO(qiao) change batch_size
for (size_t i = 1; i < shape.size(); ++i) { for (size_t i = 1; i < shape.size(); ++i) {
PADDLE_ENFORCE(shape[i] > 0, PADDLE_ENFORCE(shape[i] > 0,
"Each dimension of shape " "Each dimension of Attr(shape) "
"must be positiv except the first."); "must be positive except the first one.");
} }
if (shape[0] < 0) { if (shape[0] < 0) {
shape[0] = x_dims[0]; shape[0] = x_dims[0];
......
...@@ -125,7 +125,8 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -125,7 +125,8 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor), " "(Tensor), "
"Argmaxes corresponding to indices in X used " "Argmaxes corresponding to indices in X used "
"for gradient computation. Only output " "for gradient computation. Only output "
"if arg “is_test” is false.").AsIntermediate(); "if arg “is_test” is false.")
.AsIntermediate();
AddAttr<float>("spatial_scale", AddAttr<float>("spatial_scale",
"(float, default 1.0), " "(float, default 1.0), "
"Multiplicative spatial scale factor " "Multiplicative spatial scale factor "
...@@ -153,11 +154,10 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn ...@@ -153,11 +154,10 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
roi_pool_grad, ops::ROIPoolGradOp); ops::ROIPoolGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
roi_pool, roi_pool, ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>); ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
roi_pool_grad, roi_pool_grad,
......
...@@ -29,12 +29,14 @@ static inline int NumBlocks(const int N) { ...@@ -29,12 +29,14 @@ static inline int NumBlocks(const int N) {
kNumMaxinumNumBlocks); kNumMaxinumNumBlocks);
} }
template <typename T> template <typename T>
__global__ void GPUROIPoolForward( __global__ void GPUROIPoolForward(const int nthreads, const T* input_data,
const int nthreads, const T* input_data, const int64_t* input_rois, const int64_t* input_rois,
const float spatial_scale, const int channels, const int height, const float spatial_scale, const int channels,
const int width, const int pooled_height, const int pooled_width, const int height, const int width,
T* output_data, int64_t* argmax_data) { const int pooled_height,
const int pooled_width, T* output_data,
int64_t* argmax_data) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x; int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) { for (size_t i = index; i < nthreads; i += offset) {
...@@ -84,22 +86,14 @@ static inline int NumBlocks(const int N) { ...@@ -84,22 +86,14 @@ static inline int NumBlocks(const int N) {
argmax_data[index] = maxidx; argmax_data[index] = maxidx;
} }
} }
} }
template <typename T> template <typename T>
__global__ void GPUROIPoolBackward( __global__ void GPUROIPoolBackward(
const int nthreads, const int nthreads, const int64_t* input_rois, const T* output_grad,
const int64_t* input_rois, const int64_t* argmax_data, const int num_rois, const float spatial_scale,
const T* output_grad, const int channels, const int height, const int width,
const int64_t* argmax_data, const int pooled_height, const int pooled_width, T* input_grad) {
const int num_rois,
const float spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
T* input_grad) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x; int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) { for (int i = index; i < nthreads; i += offset) {
...@@ -118,12 +112,12 @@ __global__ void GPUROIPoolBackward( ...@@ -118,12 +112,12 @@ __global__ void GPUROIPoolBackward(
int argmax = offset_argmax_data[ph * pooled_width + pw]; int argmax = offset_argmax_data[ph * pooled_width + pw];
if (argmax != -1) { if (argmax != -1) {
platform::CudaAtomicAdd(offset_input_grad + argmax, platform::CudaAtomicAdd(
offset_input_grad + argmax,
static_cast<T>(offset_output_grad[ph * pooled_width + pw])); static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
} }
} }
} }
template <typename Place, typename T> template <typename Place, typename T>
class GPUROIPoolOpKernel : public framework::OpKernel<T> { class GPUROIPoolOpKernel : public framework::OpKernel<T> {
...@@ -145,23 +139,16 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -145,23 +139,16 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
int width = in_dims[3]; int width = in_dims[3];
size_t rois_num = rois->dims()[0]; size_t rois_num = rois->dims()[0];
if (rois_num== 0) return; if (rois_num == 0) return;
int output_size = out->numel(); int output_size = out->numel();
int blocks = NumBlocks(output_size); int blocks = NumBlocks(output_size);
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
GPUROIPoolForward<T> GPUROIPoolForward<
<<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>( T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_size, output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
in->data<T>(), channels, height, width, pooled_height, pooled_width,
rois->data<int64_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
out->mutable_data<T>(ctx.GetPlace()), out->mutable_data<T>(ctx.GetPlace()),
argmax->mutable_data<int64_t>(ctx.GetPlace())); argmax->mutable_data<int64_t>(ctx.GetPlace()));
} }
...@@ -175,10 +162,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> { ...@@ -175,10 +162,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
auto* rois = ctx.Input<Tensor>("ROIs"); auto* rois = ctx.Input<Tensor>("ROIs");
auto* argmax = ctx.Input<Tensor>("Argmax"); auto* argmax = ctx.Input<Tensor>("Argmax");
auto* out_grad = auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
ctx.Input<Tensor>(framework::GradVarName("Out")); auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* x_grad =
ctx.Output<Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height"); auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width"); auto pooled_width = ctx.Attr<int>("pooled_width");
...@@ -199,19 +184,11 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> { ...@@ -199,19 +184,11 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
if (output_grad_size > 0) { if (output_grad_size > 0) {
GPUROIPoolBackward<T> GPUROIPoolBackward<
<<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>( T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_grad_size, output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
rois->data<int64_t>(), argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
out_grad->data<T>(), width, pooled_height, pooled_width,
argmax->data<int64_t>(),
rois_num,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
x_grad->mutable_data<T>(ctx.GetPlace())); x_grad->mutable_data<T>(ctx.GetPlace()));
} }
} }
...@@ -223,8 +200,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> { ...@@ -223,8 +200,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
roi_pool, roi_pool, ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>); ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
roi_pool_grad, roi_pool_grad,
......
...@@ -133,54 +133,47 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> { ...@@ -133,54 +133,47 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
auto* in = ctx.Input<framework::Tensor>("X"); auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::Tensor>("ROIs"); auto* rois = ctx.Input<framework::Tensor>("ROIs");
auto* argmax = ctx.Input<framework::Tensor>("Argmax"); auto* argmax = ctx.Input<framework::Tensor>("Argmax");
auto* out_grad = auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out")); ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* x_grad = auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height"); auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width"); auto pooled_width = ctx.Attr<int>("pooled_width");
if (x_grad) { if (in_grad) {
int channels = in->dims()[1];
auto in_stride = framework::stride(in->dims());
auto roi_stride = framework::stride(rois->dims());
const int64_t* rois_data = rois->data<int64_t>(); const int64_t* rois_data = rois->data<int64_t>();
int rois_num = rois->dims()[0]; const T* out_grad_data = out_grad->data<T>();
const int64_t* argmax_data = argmax->data<int64_t>();
T* x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace()); T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
math::SetConstant<Place, T> set_zero; math::SetConstant<Place, T> set_zero;
set_zero(ctx.device_context(), x_grad, static_cast<T>(0)); set_zero(ctx.device_context(), in_grad, static_cast<T>(0));
size_t roi_offset = roi_stride[0]; auto in_stride = framework::stride(in->dims());
size_t batch_offset = in_stride[0]; auto argmax_stride = framework::stride(argmax->dims());
size_t channel_offset = in_stride[1]; auto roi_stride = framework::stride(rois->dims());
auto out_stride = framework::stride(out_grad->dims());
const T* out_grad_data = out_grad->data<T>(); int rois_num = rois->dims()[0];
size_t pool_channel_offset = pooled_height * pooled_width; int channels = in->dims()[1];
const int64_t* argmax_data = argmax->data<int64_t>();
for (size_t n = 0; n < rois_num; ++n) { for (int n = 0; n < rois_num; ++n) {
size_t roi_batch_idx = rois_data[0]; int roi_batch_idx = rois_data[0];
T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx; T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) { for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) { for (int pw = 0; pw < pooled_width; ++pw) {
size_t pool_index = ph * pooled_width + pw; int pool_index = ph * pooled_width + pw;
if (argmax_data[pool_index] >= 0) { if (argmax_data[pool_index] >= 0) {
size_t index = static_cast<size_t>(argmax_data[pool_index]); auto index = argmax_data[pool_index];
batch_grad_data[index] += out_grad_data[pool_index]; batch_grad_data[index] += out_grad_data[pool_index];
} }
} }
} }
batch_grad_data += channel_offset; batch_grad_data += in_stride[1];
out_grad_data += pool_channel_offset; out_grad_data += out_stride[1];
argmax_data += pool_channel_offset; argmax_data += argmax_stride[1];
} }
rois_data += roi_offset; rois_data += roi_stride[0];
} }
} }
} }
......
...@@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase { ...@@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase {
"SaveOp only support LoDTensor, %s has wrong type", iname); "SaveOp only support LoDTensor, %s has wrong type", iname);
auto &tensor = var->Get<framework::LoDTensor>(); auto &tensor = var->Get<framework::LoDTensor>();
framework::SerializeToStream(fout, tensor, dev_ctx);
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
fout.write(reinterpret_cast<const char *>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
framework::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto *pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
fout.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto *data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto &gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::GPUPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
fout.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
fout.write(static_cast<const char *>(data_ptr),
static_cast<std::streamsize>(size));
}
}
{ // the 4th field, lod information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
auto lod = tensor.lod();
uint64_t size = lod.size();
fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
for (auto &each : lod) {
size = each.size() * sizeof(framework::LoD::value_type::value_type);
fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
fout.write(reinterpret_cast<const char *>(each.data()),
static_cast<std::streamsize>(size));
}
}
} }
}; };
......
...@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, ...@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
ops::ScaleGradMaker); ops::ScaleGradMaker);
REGISTER_OP_CPU_KERNEL(scale, REGISTER_OP_CPU_KERNEL(scale,
ops::ScaleKernel<paddle::platform::CPUPlace, float>, ops::ScaleKernel<paddle::platform::CPUPlace, float>,
ops::ScaleKernel<paddle::platform::CPUPlace, double>); ops::ScaleKernel<paddle::platform::CPUPlace, double>,
ops::ScaleKernel<paddle::platform::CPUPlace, int>,
ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
...@@ -16,4 +16,6 @@ ...@@ -16,4 +16,6 @@
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>, scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>); paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/send_recv_impl.h"
#include "paddle/operators/detail/simple_block_queue.h"
namespace paddle {
namespace operators {
// TODO(typhoonzero): this is a simple implementation which only send
// one tensor
class SendOp : public framework::OperatorBase {
public:
SendOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
// init client when the operator is created at runtime.
if (!client_) {
std::string endpoint = Attr<std::string>("endpoint");
client_.reset(new detail::RPCClient(
grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials())));
// TODO(typhoonzero): how to call InitVariables
}
}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto iname = Input("X");
auto oname = Output("Out");
// TODO(typhoonzero): currently it's non-blocking,
// should block until server responds.
bool ret = client_->SendVariable(scope, iname, oname);
if (!ret) {
LOG(ERROR) << "send variable error";
}
}
protected:
std::shared_ptr<detail::RPCClient> client_{nullptr};
};
class SendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) Input tensor to be saved");
AddOutput("Out", "(Tensor) Output fetched from server");
AddComment(R"DOC(
Recv operator
This operator will recv tensor from send_op
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// TODO(typhoonzero): add python bindings for this test as
// a RemoteOptimizer.
#include <unistd.h>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h"
USE_NO_KERNEL_OP(send);
USE_NO_KERNEL_OP(recv);
USE_OP(sum);
// global for simplicity.
std::unique_ptr<paddle::framework::OperatorBase> recv_op;
void InitTensorsInScope(paddle::framework::Scope &scope,
paddle::platform::CPUPlace &place) {
paddle::platform::CPUDeviceContext ctx(place);
auto var = scope.Var("X");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({10, 10});
float *expect = tensor->mutable_data<float>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<float>(i);
}
auto out_var = scope.Var("Out");
auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
out_tensor->Resize({10, 10});
tensor->mutable_data<float>(place); // allocate
}
void AddOp(const std::string &type,
const paddle::framework::VariableNameMap &inputs,
const paddle::framework::VariableNameMap &outputs,
paddle::framework::AttributeMap attrs,
paddle::framework::BlockDescBind *block) {
// insert output
for (auto kv : outputs) {
for (auto v : kv.second) {
auto var = block->Var(v);
var->SetDataType(paddle::framework::DataType::FP32);
}
}
// insert op
auto op = block->AppendOp();
op->SetType(type);
for (auto &kv : inputs) {
op->SetInput(kv.first, kv.second);
}
for (auto &kv : outputs) {
op->SetOutput(kv.first, kv.second);
}
op->SetAttrMap(attrs);
}
void StartServerNet() {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
InitTensorsInScope(scope, place);
// sub program run in recv_op, for simple test we use sum
paddle::framework::ProgramDescBind program;
paddle::framework::BlockDescBind *block = program.MutableBlock(0);
// X for server side tensors, RX for received tensers, must be of same shape.
AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block);
paddle::framework::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
attrs.insert({"OptimizeBlock", block});
recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
{{"Out", {"Out"}}}, attrs);
paddle::platform::CPUDeviceContext ctx(place);
recv_op->Run(scope, ctx);
}
TEST(SendRecvOp, CPU) {
std::thread server_thread(StartServerNet);
sleep(5); // wait server to start
// local net
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
InitTensorsInScope(scope, place);
paddle::framework::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
auto send_op = paddle::framework::OpRegistry::CreateOp(
"send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
paddle::platform::CPUDeviceContext ctx(place);
send_op->Run(scope, ctx);
auto in_var = scope.Var("X");
auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
float *expected = tensor->data<float>();
auto out_var = scope.Var("Out");
auto target = out_var->GetMutable<paddle::framework::LoDTensor>();
// send fail cause output is none.
EXPECT_NE(target->memory_size(), size_t(0));
float *actual = target->data<float>();
for (int64_t i = 0; i < target->numel(); ++i) {
EXPECT_EQ(expected[i] * 2, actual[i]);
}
recv_op.reset(); // dtor can shutdown and join server thread.
server_thread.join();
}
...@@ -93,8 +93,7 @@ class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -93,8 +93,7 @@ class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor), " "(Tensor), "
"a vector<int> to describe the length of every input sequence for " "a vector<int> to describe the length of every input sequence for "
"sub sequence item."); "sub sequence item.");
AddOutput("Out", AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp.");
"(LoDTensor), the output of SequenceSliceOp.");
AddComment(R"DOC( AddComment(R"DOC(
Sequence slice operator Sequence slice operator
......
...@@ -55,7 +55,7 @@ SGD operator ...@@ -55,7 +55,7 @@ SGD operator
This operator implements one step of the stochastic gradient descent algorithm. This operator implements one step of the stochastic gradient descent algorithm.
$$param_out = param - learning_rate * grad$$ $$param\_out = param - learning\_rate * grad$$
)DOC"); )DOC");
} }
......
...@@ -57,11 +57,21 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -57,11 +57,21 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto, ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker) framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", ""); AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
AddInput("RankTable", ""); AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
AddInput("I", ""); AddInput("I",
AddOutput("Out", ""); "(LoDTensor) The step index. The RNN step memory 'X' will be "
AddComment(""); "shrinked to match the size of the input of the index'th step.");
AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
AddComment(
R"DOC(
In dynamic RNN, we are able to handle sequences of different lengths.
Because of the multiple lengths, the size of each step input can be
different, which may lead to a mismatching between the input of
the current step and the memory generated by the previous one. This
operator shrinks memory according to the size of the next step input,
to make sure that they can match each other.
)DOC");
} }
}; };
......
...@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { ...@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); PADDLE_ENFORCE_EQ(x_dims, y_dims);
PADDLE_ENFORCE_GE(x_dims.size(), 2, PADDLE_ENFORCE_GE(x_dims.size(), 2,
"The tensor rank of X must be at least 2."); "The tensor rank of Input(X) should not be less than 2.");
if (ctx->HasInput("InsideWeight")) { if (ctx->HasInput("InsideWeight")) {
PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
"If weights are provided, must specify both " "If weights are provided, must specify both "
"inside and outside weights."); "inside and outside weights.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims, PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
"The shape of InsideWeight must be same as X."); PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims,
"The shape of OutsideWeight must be same as X.");
} }
ctx->SetOutputDim("Diff", x_dims); ctx->SetOutputDim("Diff", x_dims);
...@@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker) framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", AddInput("X",
"The input tensor of smooth l1 loss op." "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"The rank should be greater or equal to 2 with shape " "The input value of smooth l1 loss op with shape "
"[batch_size, value_dim1, value_dim2, ..., value_dimN]"); "[batch_size, dim1, ..., dimN].");
AddInput("Y", AddInput("Y",
"The target tensor of smooth l1 loss op " "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"with the same shape as X."); "The target value of smooth l1 loss op with same shape as X.");
AddInput("InsideWeight", AddInput("InsideWeight",
"Optional input tensor of smooth l1 loss op with the same shape " "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"as X. If provided, the result of (X - Y) will be multiplied " "This input is optional and should have same shape with X. "
"If provided, the result of (X - Y) will be multiplied "
"by this tensor element by element.") "by this tensor element by element.")
.AsDispensable(); .AsDispensable();
AddInput("OutsideWeight", AddInput("OutsideWeight",
"Optinal input of smooth l1 loss op with the same shape as X." "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"If provided, the output smooth l1 loss will be multiplied by " "This input is optional and should have same shape with X. "
"this tensor element by element.") "If provided, the out smooth l1 loss will be multiplied by this "
"tensor element by element.")
.AsDispensable(); .AsDispensable();
AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).") AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
.AsIntermediate(); .AsIntermediate();
AddOutput("Out", "Smooth l1 loss."); AddOutput("Out",
"(Tensor, default Tensor<float>) A tensor with rank be 2. "
"The output smooth l1 loss with shape [batch_size, 1].");
AddAttr<AttrType>("sigma", AddAttr<AttrType>("sigma",
"Hyper parameter of smooth l1 loss op." "Hyper parameter of smooth l1 loss op."
"A float scalar with default value 3.0.") "A float scalar with default value 3.0.")
...@@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Smooth L1 Loss Operator. Smooth L1 Loss Operator.
This operator computes the smooth l1 loss for input and target. This operator computes the smooth l1 loss for X and Y.
The operator takes the first dimension of input as the batch size. The operator takes the first dimension of X and Y as batch size.
For each instance, it computes the smooth l1 loss element by element first For each instance, it computes the smooth l1 loss element by element first
and then sums all the losses. So the resulting output shape and then sums all the losses. So the shape of Out is [batch_size, 1].
is [batch_size, 1].
The equation is: The equation is:
loss = $$0.5 * (\sigma * (x-y))^2$$ if $$|x - y| < 1 /({\sigma}^2)$$ $$
$$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise Out_{\sigma}(X, Y)_i = \begin{cases}
0.5 * (\sigma * (X_i - Y_i)) ^ 2
\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\
\frac{|X_i - Y_i| - 0.5}{{\sigma}^2},
\quad otherwise
\end{cases}
$$
In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
element of Out, X and Y.
)DOC"); )DOC");
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Indicesou may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/unpool_op.h"
namespace paddle {
namespace operators {
class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Unpool2dOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"X",
"(Tensor) The input tensor of unpool operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature.");
AddInput(
"Indices",
"(Tensor) The input tensor of the indices given out by MaxPool2d. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature.");
AddOutput("Out",
"(Tensor) The output tensor of unpool operator."
"The format of output tensor is also NCHW."
"Where N is batch size, C is "
"the number of channels, H and W is the height and "
"width of feature.");
AddAttr<std::vector<int>>(
"ksize",
"(vector), the unpooling window size(height, width) "
"of unpooling operator.");
AddAttr<std::vector<int>>("strides",
"(vector, default:{1, 1}), "
"strides (height, width) of unpooling operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings",
"(vector defalut:{0,0}), "
"paddings (height, width) of unpooling operator.")
.SetDefault({0, 0});
AddAttr<std::string>(
"unpooling_type",
"(string), unpooling type, can be \"max\" for max-unpooling ")
.InEnum({"max"});
AddComment(R"DOC(
"Input shape: $(N, C_{in}, H_{in}, W_{in})$
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
$$
H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
$$
Paper: http://www.matthewzeiler.com/wp-content/uploads/2017
/07/iccv2011.pdf
)DOC");
}
};
int OutputSize(int input_size, int ksize, int padding, int stride) {
int output_size = (input_size - 1) * stride - 2 * padding + ksize;
return output_size;
}
class UnpoolOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.device_context());
}
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of UnpoolOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Indices"),
"Input(Indices) of UnpoolOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of UnpoolOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
auto in_y_dims = ctx->GetInputDim("Indices");
std::string unpooling_type =
ctx->Attrs().Get<std::string>("unpooling_type");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
PADDLE_ENFORCE(in_x_dims.size() == 4,
"Unpooling intput must be of 4-dimensional.");
PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
for (size_t i = 0; i < ksize.size(); ++i) {
output_shape.push_back(
OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
}
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
}
};
class UnpoolOpGrad : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.device_context());
}
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Input(X@GRAD) should not be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
ops::UnpoolOpGrad);
REGISTER_OP_CPU_KERNEL(unpool,
ops::UnpoolKernel<paddle::platform::CPUPlace, float>,
ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Indicesou may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/unpool_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(unpool,
ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Indicesou may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/unpooling.h"
namespace paddle {
namespace operators {
template <typename Place, typename T>
class UnpoolKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
auto* out = context.Output<framework::Tensor>("Out");
std::string unpooling_type = context.Attr<std::string>("unpooling_type");
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
T* output_data = out->mutable_data<T>(context.GetPlace());
if (output_data) {
math::SetConstant<Place, T> set_zero;
set_zero(context.device_context(), out, static_cast<T>(0));
}
math::Unpool2dMaxFunctor<Place, T> unpool2d_max_forward;
unpool2d_max_forward(context.device_context(), *in_x, *in_y, out);
}
};
template <typename Place, typename T>
class UnpoolGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
const framework::Tensor* out = context.Input<framework::Tensor>("Out");
const framework::Tensor* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
framework::Tensor* in_x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
std::string unpooling_type = context.Attr<std::string>("unpooling_type");
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
auto& device_ctx = context.device_context();
math::SetConstant<Place, T> zero;
if (in_x_grad) {
in_x_grad->mutable_data<T>(context.GetPlace());
zero(device_ctx, in_x_grad, static_cast<T>(0));
}
math::Unpool2dMaxGradFunctor<Place, T> unpool2d_max_backward;
unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out,
*out_grad, in_x_grad);
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace paddle {
namespace platform {
void CudaProfilerInit(std::string output_file, std::string output_mode,
std::vector<std::string> config_flags) {
std::array<char, 128> buf;
std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
memcpy(buf.data(), tmpl.data(), tmpl.size());
auto result = mktemp(buf.data());
PADDLE_ENFORCE(strlen(result) != 0);
std::string config_file = result;
{
std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
for (const auto& line : config_flags) {
ofs << line << std::endl;
}
}
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
PADDLE_ENFORCE(
cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
}
void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
} // namespace platform
} // namespace paddle
...@@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); ...@@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif #endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -37,6 +37,7 @@ limitations under the License. */ ...@@ -37,6 +37,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/operators/nccl/nccl_gpu_common.h" #include "paddle/operators/nccl/nccl_gpu_common.h"
#include "paddle/platform/cuda_profiler.h"
#include "paddle/platform/gpu_info.h" #include "paddle/platform/gpu_info.h"
#endif #endif
...@@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU); m.def("op_support_gpu", OpSupportGPU);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
m.def("get_cuda_device_count", platform::GetCUDADeviceCount); m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
m.def("nvprof_init", platform::CudaProfilerInit);
m.def("nvprof_start", platform::CudaProfilerStart);
m.def("nvprof_stop", platform::CudaProfilerStop);
#endif #endif
return m.ptr(); return m.ptr();
......
...@@ -16,11 +16,13 @@ function cmake_gen() { ...@@ -16,11 +16,13 @@ function cmake_gen() {
echo "using python abi: $1" echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then if [ "$1" == "cp27-cp27m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
elif [ "$1" == "cp27-cp27mu" ]; then elif [ "$1" == "cp27-cp27mu" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
...@@ -181,6 +183,7 @@ EOF ...@@ -181,6 +183,7 @@ EOF
${DOCKERFILE_GPU_ENV} ${DOCKERFILE_GPU_ENV}
ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/ ADD go/cmd/master/master /usr/bin/
ADD paddle/pybind/print_operators_doc /usr/bin/
# default command shows the paddle version and exit # default command shows the paddle version and exit
CMD ["paddle", "version"] CMD ["paddle", "version"]
EOF EOF
......
...@@ -11,8 +11,9 @@ make -j `nproc` gen_proto_py ...@@ -11,8 +11,9 @@ make -j `nproc` gen_proto_py
make -j `nproc` paddle_docs paddle_docs_cn make -j `nproc` paddle_docs paddle_docs_cn
# check websites for broken links # check websites for broken links
linkchecker doc/en/html/index.html # It will be failed now!
linkchecker doc/cn/html/index.html #linkchecker doc/en/html/index.html
#linkchecker doc/cn/html/index.html
# Parse Github URL # Parse Github URL
REPO=`git config remote.origin.url` REPO=`git config remote.origin.url`
......
...@@ -544,6 +544,9 @@ message LayerConfig { ...@@ -544,6 +544,9 @@ message LayerConfig {
// for batch normalization layer // for batch normalization layer
// The small constant added to the variance to improve numeric stability. // The small constant added to the variance to improve numeric stability.
optional double epsilon = 60 [ default = 0.00001 ]; optional double epsilon = 60 [ default = 0.00001 ];
// for factorization machine layer
optional uint32 factor_size = 61;
} }
message EvaluatorConfig { message EvaluatorConfig {
......
...@@ -2400,6 +2400,14 @@ class CropLayer(LayerBase): ...@@ -2400,6 +2400,14 @@ class CropLayer(LayerBase):
image_conf.img_size_y = input_layer.height image_conf.img_size_y = input_layer.height
image_conf.channels = input_layer.size / (input_layer.width * image_conf.channels = input_layer.size / (input_layer.width *
input_layer.height) input_layer.height)
# only support for 4-dims inputs and NCHW order
if (len(self.config.inputs) == 2):
self.set_layer_height_width(
self.get_input_layer(1).height, self.get_input_layer(1).width)
self.set_layer_size(self.get_input_layer(1).size)
else:
self.set_layer_height_width(shape[-2], shape[-1])
self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
@config_layer('batch_norm') @config_layer('batch_norm')
...@@ -2798,19 +2806,18 @@ class AddToLayer(LayerBase): ...@@ -2798,19 +2806,18 @@ class AddToLayer(LayerBase):
name, self.layer_type, 0, inputs=inputs, **xargs) name, self.layer_type, 0, inputs=inputs, **xargs)
config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer') config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
if len(self.inputs) > 1: layer_size = self.get_input_layer(0).size
# To reserve heght, width, depth.
layer_with_hwc = self.get_input_layer(0)
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
assert self.get_input_layer(0).height == self.get_input_layer( input_layer = self.get_input_layer(input_index)
input_index).height assert layer_size == input_layer.size
assert self.get_input_layer(0).width == self.get_input_layer( if input_layer.height and input_layer.height and input_layer.height:
input_index).width layer_with_hwc = input_layer
assert self.get_input_layer(0).depth == self.get_input_layer(
input_index).depth
self.set_layer_size(self.get_input_layer(0).size) self.set_layer_size(layer_with_hwc.size)
self.set_layer_height_width(self.get_input_layer(0).height, \ self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width)
self.get_input_layer(0).width) self.set_layer_depth(layer_with_hwc.depth)
self.set_layer_depth(self.get_input_layer(0).depth)
self.create_bias_parameter(bias, self.config.size) self.create_bias_parameter(bias, self.config.size)
...@@ -3850,6 +3857,26 @@ class SwitchOrderLayer(LayerBase): ...@@ -3850,6 +3857,26 @@ class SwitchOrderLayer(LayerBase):
name, 'switch_order', 0, inputs=inputs, **xargs) name, 'switch_order', 0, inputs=inputs, **xargs)
self.config.reshape_conf.height_axis.extend(reshape['height']) self.config.reshape_conf.height_axis.extend(reshape['height'])
self.config.reshape_conf.width_axis.extend(reshape['width']) self.config.reshape_conf.width_axis.extend(reshape['width'])
input_layer = self.get_input_layer(0)
if reshape is None:
self.set_layer_size(input_layer.size)
else:
in_h = input_layer.height
in_w = input_layer.width
out_dims = None
if input_layer.has_depth():
in_d = input_layer.depth
in_c = input_layer.size / in_h / in_w / in_d
# batch_size, depth, height, width, channel
out_dims = [0, in_d, in_h, in_w, in_c]
else:
in_c = input_layer.size / in_h / in_w
# batch_size, height, width, channel
out_dims = [0, in_h, in_w, in_c]
# Because (reshape['width'][0] > 0) always be true.
# So out_dims[0] won't be used.
size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
self.set_layer_size(size)
@config_layer('scale_sub_region') @config_layer('scale_sub_region')
...@@ -3871,6 +3898,21 @@ class ScaleSubRegionLayer(LayerBase): ...@@ -3871,6 +3898,21 @@ class ScaleSubRegionLayer(LayerBase):
image_conf.channels) image_conf.channels)
@config_layer('factorization_machine')
class FactorizationMachineLayer(LayerBase):
def __init__(self, name, inputs, factor_size, **xargs):
super(FactorizationMachineLayer, self).__init__(
name, 'factorization_machine', size=1, inputs=inputs, **xargs)
config_assert(
len(self.inputs) == 1,
'factorization machine layer must have one and only one input.')
self.config.factor_size = factor_size
input_layer = self.get_input_layer(0)
psize = input_layer.size * factor_size
dims = [input_layer.size, factor_size]
self.create_input_parameter(0, psize, dims)
# Deprecated, use a new layer specific class instead # Deprecated, use a new layer specific class instead
@config_func @config_func
def Layer(name, type, **xargs): def Layer(name, type, **xargs):
......
...@@ -148,6 +148,7 @@ __all__ = [ ...@@ -148,6 +148,7 @@ __all__ = [
'resize_layer', 'resize_layer',
'sub_seq_layer', 'sub_seq_layer',
'scale_sub_region_layer', 'scale_sub_region_layer',
'factorization_machine',
] ]
...@@ -264,6 +265,8 @@ class LayerType(object): ...@@ -264,6 +265,8 @@ class LayerType(object):
SCALE_SUB_REGION_LAYER = 'scale_sub_region' SCALE_SUB_REGION_LAYER = 'scale_sub_region'
FACTORIZATION_MACHINE = 'factorization_machine'
@staticmethod @staticmethod
def is_layer_type(type_name): def is_layer_type(type_name):
""" """
...@@ -1900,9 +1903,12 @@ def repeat_layer(input, ...@@ -1900,9 +1903,12 @@ def repeat_layer(input,
A layer for repeating the input for num_repeats times. A layer for repeating the input for num_repeats times.
If as_row_vector: If as_row_vector:
.. math:: .. math::
y = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n] y = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
If not as_row_vector: If not as_row_vector:
.. math:: .. math::
y = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n] y = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
...@@ -1915,19 +1921,19 @@ def repeat_layer(input, ...@@ -1915,19 +1921,19 @@ def repeat_layer(input,
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param num_repeats: Repeat the input so many times :param num_repeats: The times of repeating the input.
:type num_repeats: int :type num_repeats: int
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:param as_row_vector: True for treating input as row vector and repeating :type name: basestring
in the column direction. This is equivalent to apply :param as_row_vector: Whether to treat the input as row vectors or not. If
concat_layer() with num_repeats same input. the parameter is set to True, the repeating operation
False for treating input as column vector and repeating will be performed in the column direction. Otherwise,
in the row direction. it will be performed in the row direction.
:type as_row_vector: bool :type as_row_vector: bool
:param act: Activation type. IdentityActivation is the default activation. :param act: Activation type. IdentityActivation is the default activation.
:type act: BaseActivation :type act: BaseActivation
:type name: basestring :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
:param layer_attr: extra layer attributes. details.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -1974,13 +1980,14 @@ def seq_reshape_layer(input, ...@@ -1974,13 +1980,14 @@ def seq_reshape_layer(input,
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param reshape_size: the size of reshaped sequence. :param reshape_size: The dimension of the reshaped sequence.
:type reshape_size: int :type reshape_size: int
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param act: Activation type. IdentityActivation is the default activation. :param act: Activation type. IdentityActivation is the default activation.
:type act: BaseActivation :type act: BaseActivation
:param layer_attr: extra layer attributes. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:param bias_attr: The bias attribute. If the parameter is set to False or an object :param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the whose type is not ParameterAttribute, no bias is defined. If the
...@@ -2008,7 +2015,7 @@ def seq_reshape_layer(input, ...@@ -2008,7 +2015,7 @@ def seq_reshape_layer(input,
@layer_support() @layer_support()
def interpolation_layer(input, weight, name=None, layer_attr=None): def interpolation_layer(input, weight, name=None, layer_attr=None):
""" """
This layer is for linear interpolation with two inputs, This layer performs linear interpolation on two inputs,
which is used in NEURAL TURING MACHINE. which is used in NEURAL TURING MACHINE.
.. math:: .. math::
...@@ -2030,7 +2037,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None): ...@@ -2030,7 +2037,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None):
:type weight: LayerOutput :type weight: LayerOutput
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param layer_attr: extra layer attributes. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -2064,7 +2072,7 @@ def bilinear_interp_layer(input, ...@@ -2064,7 +2072,7 @@ def bilinear_interp_layer(input,
name=None, name=None,
layer_attr=None): layer_attr=None):
""" """
This layer is to implement bilinear interpolation on conv layer output. This layer implements bilinear interpolation on convolutional layer's output.
Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
...@@ -2074,15 +2082,16 @@ def bilinear_interp_layer(input, ...@@ -2074,15 +2082,16 @@ def bilinear_interp_layer(input,
bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64) bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
:param input: A input layer. :param input: The input of this layer.
:type input: LayerOutput. :type input: LayerOutput.
:param out_size_x: bilinear interpolation output width. :param out_size_x: The width of the output.
:type out_size_x: int | None :type out_size_x: int
:param out_size_y: bilinear interpolation output height. :param out_size_y: The height of the output.
:type out_size_y: int | None :type out_size_y: int
:param name: The layer's name, which cna not be specified. :param name: The name of this layer. It is optional.
:type name: None | basestring :type name: basestring
:param layer_attr: Extra Layer attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -2120,8 +2129,8 @@ def power_layer(input, weight, name=None, layer_attr=None): ...@@ -2120,8 +2129,8 @@ def power_layer(input, weight, name=None, layer_attr=None):
.. math:: .. math::
y = x^w y = x^w
where :math:`x` is a input vector, :math:`w` is scalar weight, where :math:`x` is an input vector, :math:`w` is a scalar exponent,
and :math:`y` is a output vector. and :math:`y` is an output vector.
The example usage is: The example usage is:
...@@ -2131,11 +2140,12 @@ def power_layer(input, weight, name=None, layer_attr=None): ...@@ -2131,11 +2140,12 @@ def power_layer(input, weight, name=None, layer_attr=None):
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param weight: Weight layer. :param weight: The exponent of the power.
:type weight: LayerOutput :type weight: LayerOutput
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param layer_attr: extra layer attributes. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -2175,11 +2185,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None): ...@@ -2175,11 +2185,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param weight: Weight layer. :param weight: The weight of each sample.
:type weight: LayerOutput :type weight: LayerOutput
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param layer_attr: extra layer attributes. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -2217,7 +2228,8 @@ def trans_layer(input, name=None, layer_attr=None): ...@@ -2217,7 +2228,8 @@ def trans_layer(input, name=None, layer_attr=None):
:type input: LayerOutput :type input: LayerOutput
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param layer_attr: extra layer attributes. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -2253,11 +2265,14 @@ def rotate_layer(input, height, width, name=None, layer_attr=None): ...@@ -2253,11 +2265,14 @@ def rotate_layer(input, height, width, name=None, layer_attr=None):
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param height: The height of the sample matrix :param height: The height of the sample matrix.
:type height: int :type height: int
:param width: The width of the sample matrix.
:type width: int
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param layer_attr: extra layer attributes. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -2302,15 +2317,15 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None): ...@@ -2302,15 +2317,15 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param a: input layer a :param a: The first input of this layer.
:type a: LayerOutput :type a: LayerOutput
:param b: input layer b :param b: The second input of this layer.
:type b: LayerOutput :type b: LayerOutput
:param scale: scale for cosine value. default is 5. :param scale: The scale of the cosine similarity. 1 is the default value.
:type scale: float :type scale: float
:param size: layer size. NOTE size_a * size should equal size_b. :param size: The dimension of this layer. NOTE size_a * size should equal size_b.
:type size: int :type size: int
:param layer_attr: Extra Layer Attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -2395,8 +2410,10 @@ def hsigmoid(input, ...@@ -2395,8 +2410,10 @@ def hsigmoid(input,
""" """
Organize the classes into a binary tree. At each node, a sigmoid function Organize the classes into a binary tree. At each node, a sigmoid function
is used to calculate the probability of belonging to the right branch. is used to calculate the probability of belonging to the right branch.
This idea is from "F. Morin, Y. Bengio (AISTATS 05):
Hierarchical Probabilistic Neural Network Language Model." Reference:
`Hierarchical Probabilistic Neural Network Language Model
<http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
The example usage is: The example usage is:
...@@ -2407,19 +2424,21 @@ def hsigmoid(input, ...@@ -2407,19 +2424,21 @@ def hsigmoid(input,
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput | list | tuple :type input: LayerOutput | list | tuple
:param label: Label layer. :param label: The input label.
:type label: LayerOutput :type label: LayerOutput
:param num_classes: number of classes. :param num_classes: The number of classes. And it should be larger than 2. If the parameter
:type num_classes: int | None is not set or set to None, its actual value will be automatically set to
the number of labels.
:type num_classes: int
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param bias_attr: The bias attribute. If the parameter is set to False or an object :param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero. parameter is set to True, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any :type bias_attr: ParameterAttribute | None | bool | Any
:param param_attr: Parameter Attribute. None means default parameter. :param param_attr: The parameter attribute. See ParameterAttribute for details.
:type param_attr: ParameterAttribute | None :type param_attr: ParameterAttribute
:param layer_attr: Extra Layer Attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -2969,8 +2988,8 @@ def spp_layer(input, ...@@ -2969,8 +2988,8 @@ def spp_layer(input,
A layer performs spatial pyramid pooling. A layer performs spatial pyramid pooling.
Reference: Reference:
Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
https://arxiv.org/abs/1406.4729 https://arxiv.org/abs/1406.4729`_
The example usage is: The example usage is:
...@@ -3071,8 +3090,8 @@ def img_cmrnorm_layer(input, ...@@ -3071,8 +3090,8 @@ def img_cmrnorm_layer(input,
Response normalization across feature maps. Response normalization across feature maps.
Reference: Reference:
ImageNet Classification with Deep Convolutional Neural Networks `ImageNet Classification with Deep Convolutional Neural Networks
http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_
The example usage is: The example usage is:
...@@ -3138,9 +3157,9 @@ def batch_norm_layer(input, ...@@ -3138,9 +3157,9 @@ def batch_norm_layer(input,
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
Reference: Reference:
Batch Normalization: Accelerating Deep Network Training by Reducing `Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift Internal Covariate Shift
http://arxiv.org/abs/1502.03167 http://arxiv.org/abs/1502.03167`_
The example usage is: The example usage is:
...@@ -4241,7 +4260,7 @@ def dot_prod_layer(input1, input2, name=None, layer_attr=None): ...@@ -4241,7 +4260,7 @@ def dot_prod_layer(input1, input2, name=None, layer_attr=None):
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input1: The first input layer. :param input1: The first input layer.
:type input: LayerOutput :type input1: LayerOutput
:param input2: The second input layer. :param input2: The second input layer.
:type input2: LayerOutput :type input2: LayerOutput
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
...@@ -5397,10 +5416,10 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): ...@@ -5397,10 +5416,10 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
to be devided by groups. to be devided by groups.
Reference: Reference:
Maxout Networks `Maxout Networks
http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_
Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
https://arxiv.org/pdf/1312.6082v4.pdf https://arxiv.org/pdf/1312.6082v4.pdf`_
.. math:: .. math::
y_{si+j} = \max_k x_{gsi + sk + j} y_{si+j} = \max_k x_{gsi + sk + j}
...@@ -5465,9 +5484,9 @@ def ctc_layer(input, ...@@ -5465,9 +5484,9 @@ def ctc_layer(input,
alignment between the inputs and the target labels is unknown. alignment between the inputs and the target labels is unknown.
Reference: Reference:
Connectionist Temporal Classification: Labelling Unsegmented Sequence Data `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
with Recurrent Neural Networks with Recurrent Neural Networks
http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
Note: Note:
Considering the 'blank' label needed by CTC, you need to use (num_classes + 1) Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
...@@ -5539,9 +5558,9 @@ def warp_ctc_layer(input, ...@@ -5539,9 +5558,9 @@ def warp_ctc_layer(input,
install it to :code:`third_party/install/warpctc` directory. install it to :code:`third_party/install/warpctc` directory.
Reference: Reference:
Connectionist Temporal Classification: Labelling Unsegmented Sequence Data `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
with Recurrent Neural Networks with Recurrent Neural Networks
http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
Note: Note:
- Let num_classes represents the category number. Considering the 'blank' - Let num_classes represents the category number. Considering the 'blank'
...@@ -5761,8 +5780,8 @@ def nce_layer(input, ...@@ -5761,8 +5780,8 @@ def nce_layer(input,
Noise-contrastive estimation. Noise-contrastive estimation.
Reference: Reference:
A fast and simple algorithm for training neural probabilistic language `A fast and simple algorithm for training neural probabilistic language
models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_
The example usage is: The example usage is:
...@@ -5877,8 +5896,8 @@ def rank_cost(left, ...@@ -5877,8 +5896,8 @@ def rank_cost(left,
A cost Layer for learning to rank using gradient descent. A cost Layer for learning to rank using gradient descent.
Reference: Reference:
Learning to Rank using Gradient Descent `Learning to Rank using Gradient Descent
http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_
.. math:: .. math::
...@@ -6413,8 +6432,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None): ...@@ -6413,8 +6432,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if} \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases} smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if} \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
Reference: Reference:
Fast R-CNN `Fast R-CNN
https://arxiv.org/pdf/1504.08083v2.pdf https://arxiv.org/pdf/1504.08083v2.pdf`_
The example usage is: The example usage is:
...@@ -6620,8 +6639,8 @@ def prelu_layer(input, ...@@ -6620,8 +6639,8 @@ def prelu_layer(input,
The Parametric Relu activation that actives outputs with a learnable weight. The Parametric Relu activation that actives outputs with a learnable weight.
Reference: Reference:
Delving Deep into Rectifiers: Surpassing Human-Level Performance on `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_
.. math:: .. math::
z_i &\\quad if \\quad z_i > 0 \\\\ z_i &\\quad if \\quad z_i > 0 \\\\
...@@ -6717,8 +6736,8 @@ def gated_unit_layer(input, ...@@ -6717,8 +6736,8 @@ def gated_unit_layer(input,
product between :match:`X'` and :math:`\sigma` is finally returned. product between :match:`X'` and :math:`\sigma` is finally returned.
Reference: Reference:
Language Modeling with Gated Convolutional Networks `Language Modeling with Gated Convolutional Networks
https://arxiv.org/abs/1612.08083 https://arxiv.org/abs/1612.08083`_
.. math:: .. math::
y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
...@@ -6854,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): ...@@ -6854,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
:param input: The input of this layer. If two inputs are given, the second one :param input: The input of this layer. If two inputs are given, the second one
will be regarded as the reference. will be regarded as the reference.
And the input must be 4-dims and in NCHW order.
:type input: LayerOutput | Sequence :type input: LayerOutput | Sequence
:param offset: The crop offset. :param offset: The crop offset.
:type offset: Sequence :type offset: Sequence
...@@ -7387,3 +7407,73 @@ def scale_sub_region_layer(input, indices, value, name=None): ...@@ -7387,3 +7407,73 @@ def scale_sub_region_layer(input, indices, value, name=None):
parents=[input, indices], parents=[input, indices],
num_filters=input.num_filters, num_filters=input.num_filters,
size=input.size) size=input.size)
@wrap_name_default()
@wrap_act_default(act=LinearActivation())
@wrap_param_attr_default()
@layer_support()
def factorization_machine(input,
factor_size,
act=None,
name=None,
param_attr=None,
layer_attr=None):
"""
The Factorization Machine models pairwise feature interactions as inner
product of the learned latent vectors corresponding to each input feature.
The Factorization Machine can effectively capture feature interactions
especially when the input is sparse.
This implementation only consider the 2-order feature interactions using
Factorization Machine with the formula:
.. math::
y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
Note:
X is the input vector with size n. V is the factor matrix. Each row of V
is the latent vector corresponding to each input dimesion. The size of
each latent vector is k.
For details of Factorization Machine, please refer to the paper:
Factorization machines.
.. code-block:: python
first_order = paddle.layer.fc(input=input,
size=1,
act=paddle.activation.Linear())
second_order = paddle.layer.factorization_machine(input=input,
factor_size=10)
fm = paddle.layer.addto(input=[first_order, second_order],
act=paddle.activation.Linear(),
bias_attr=False)
:param input: The input layer. Supported input types: all input data types
on CPU, and only dense input types on GPU.
:type input: LayerOutput
:param factor_size: The hyperparameter that defines the dimensionality of
the latent vector size.
:type context_len: int
:param act: Activation Type. Default is linear activation.
:type act: BaseActivation
:param param_attr: The parameter attribute. See ParameterAttribute for
details.
:type param_attr: ParameterAttribute
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(input, LayerOutput)
assert factor_size > 0, "the factor_size must be greater than 0."
Layer(
inputs=[Input(input.name, **param_attr.attr)],
name=name,
factor_size=factor_size,
type=LayerType.FACTORIZATION_MACHINE,
active_type=act.name,
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
...@@ -11,6 +11,7 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l ...@@ -11,6 +11,7 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l
test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer) test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer
test_factorization_machine)
export whole_configs=(test_split_datasource) export whole_configs=(test_split_datasource)
type: "nn"
layers {
name: "data"
type: "data"
size: 1024
active_type: ""
}
layers {
name: "__factorization_machine_0__"
type: "factorization_machine"
size: 1
active_type: ""
inputs {
input_layer_name: "data"
input_parameter_name: "___factorization_machine_0__.w0"
}
factor_size: 10
}
parameters {
name: "___factorization_machine_0__.w0"
size: 10240
initial_mean: 0.0
initial_std: 0.03125
dims: 1024
dims: 10
initial_strategy: 0
initial_smart: true
}
input_layer_names: "data"
output_layer_names: "__factorization_machine_0__"
sub_models {
name: "root"
layer_names: "data"
layer_names: "__factorization_machine_0__"
input_layer_names: "data"
output_layer_names: "__factorization_machine_0__"
is_recurrent_layer_group: false
}
from paddle.trainer_config_helpers import *
data = data_layer(name='data', size=1024)
fm = factorization_machine(input=data, factor_size=10)
outputs(fm)
...@@ -83,11 +83,10 @@ def set_omp_mkl_env_vars(trainer_count): ...@@ -83,11 +83,10 @@ def set_omp_mkl_env_vars(trainer_count):
'''Get the number of physical cores''' '''Get the number of physical cores'''
if platform.system() == "Linux": if platform.system() == "Linux":
num_sockets = int( num_sockets = int(
os.popen("lscpu |grep \"Socket\" |awk -F':' '{print $2}'|xargs") os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
.read()) .read())
num_cores_per_socket = int( num_cores_per_socket = int(
os.popen( os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
"lscpu |grep \"per socket\" |awk -F':' '{print $2}'|xargs")
.read()) .read())
return num_sockets * num_cores_per_socket return num_sockets * num_cores_per_socket
else: else:
......
...@@ -38,6 +38,7 @@ UCI_TEST_DATA = None ...@@ -38,6 +38,7 @@ UCI_TEST_DATA = None
URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b' MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
def feature_range(maximums, minimums): def feature_range(maximums, minimums):
import matplotlib import matplotlib
matplotlib.use('Agg') matplotlib.use('Agg')
...@@ -114,7 +115,8 @@ def test(): ...@@ -114,7 +115,8 @@ def test():
def model(): def model():
tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', MD5_MODEL) tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
MD5_MODEL)
with open(tar_file, 'r') as f: with open(tar_file, 'r') as f:
parameters = Parameters.from_tar(f) parameters = Parameters.from_tar(f)
return parameters return parameters
......
...@@ -13,13 +13,14 @@ import nets ...@@ -13,13 +13,14 @@ import nets
import optimizer import optimizer
import backward import backward
import regularizer import regularizer
from param_attr import ParamAttr
from core import LoDTensor, CPUPlace, GPUPlace from core import LoDTensor, CPUPlace, GPUPlace
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [ __all__ = framework.__all__ + executor.__all__ + [
'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor' 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
] ]
...@@ -35,7 +36,8 @@ def __read_gflags_from_env__(): ...@@ -35,7 +36,8 @@ def __read_gflags_from_env__():
read_env_flags = ['use_pinned_memory'] read_env_flags = ['use_pinned_memory']
if core.is_compile_gpu(): if core.is_compile_gpu():
read_env_flags.append('fraction_of_gpu_memory_to_use') read_env_flags.append('fraction_of_gpu_memory_to_use')
core.init_gflags(sys.argv + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
__read_gflags_from_env__() __read_gflags_from_env__()
...@@ -26,9 +26,9 @@ class Evaluator(object): ...@@ -26,9 +26,9 @@ class Evaluator(object):
name(str): The name of evaluator. such as, "accuracy". Used for generate name(str): The name of evaluator. such as, "accuracy". Used for generate
temporary variable name. temporary variable name.
main_program(Program, optional): The evaluator should be added to this main_program(Program, optional): The evaluator should be added to this
main_program. Default g_main_program main_program. Default default_main_program()
startup_program(Program, optional):The parameter should be added to this startup_program(Program, optional):The parameter should be added to this
startup_program. Default g_startup_program startup_program. Default default_startup_program()
Attributes: Attributes:
states(list): The list of state variables. states will be reset to zero states(list): The list of state variables. states will be reset to zero
......
import numpy as np import numpy as np
from . import core from . import core
from framework import Program, g_main_program from framework import Program, default_main_program
__all__ = ['Executor', 'g_scope'] __all__ = ['Executor', 'g_scope']
...@@ -103,7 +103,7 @@ class Executor(object): ...@@ -103,7 +103,7 @@ class Executor(object):
fetch_list = [] fetch_list = []
if program is None: if program is None:
program = g_main_program program = default_main_program()
if not isinstance(program, Program): if not isinstance(program, Program):
raise TypeError() raise TypeError()
......
...@@ -3,10 +3,12 @@ import collections ...@@ -3,10 +3,12 @@ import collections
import numpy as np import numpy as np
from . import core from . import core
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
import contextlib
__all__ = [ __all__ = [
'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
'default_main_program', 'g_startup_program', 'g_main_program' 'default_main_program', 'program_guard', 'switch_startup_program',
'switch_main_program'
] ]
...@@ -395,7 +397,11 @@ class Block(object): ...@@ -395,7 +397,11 @@ class Block(object):
return v return v
def all_parameters(self): def all_parameters(self):
return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)} return list(self.iter_parameters())
def iter_parameters(self):
return (item[1] for item in self.vars.iteritems()
if isinstance(item[1], Parameter))
def create_var(self, *args, **kwargs): def create_var(self, *args, **kwargs):
var = Variable(self, *args, **kwargs) var = Variable(self, *args, **kwargs)
...@@ -469,6 +475,37 @@ class Block(object): ...@@ -469,6 +475,37 @@ class Block(object):
for index in range(len(self.ops)): for index in range(len(self.ops)):
assert self.ops[index].desc == ops_in_cpp[index] assert self.ops[index].desc == ops_in_cpp[index]
def copy_param_info_from(self, other):
"""
Copy the information of parameters from other block
Args:
other(Block): other block
Returns:
None
"""
if not isinstance(other, Block):
raise TypeError("copy_param_info_from should be invoked with Block")
for p in other.iter_parameters():
assert isinstance(p, Parameter)
v = self.vars.get(p.name, None)
if v is None:
raise ValueError("copy_param_info_from should be invoked with "
"same topology")
assert isinstance(v, Variable)
new_p = Parameter(
block=self,
shape=v.shape,
dtype=v.dtype,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
name=v.name)
self.vars[new_p.name] = new_p
class Program(object): class Program(object):
def __init__(self): def __init__(self):
...@@ -489,6 +526,7 @@ class Program(object): ...@@ -489,6 +526,7 @@ class Program(object):
p.desc = core.ProgramDesc(self.desc) p.desc = core.ProgramDesc(self.desc)
p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
p.sync_with_cpp() p.sync_with_cpp()
p.copy_param_info_from(self)
return p return p
def prune(self, targets): def prune(self, targets):
...@@ -572,6 +610,24 @@ class Program(object): ...@@ -572,6 +610,24 @@ class Program(object):
for block in self.blocks: for block in self.blocks:
block.sync_with_cpp() block.sync_with_cpp()
def copy_param_info_from(self, other):
"""
Copy the information of parameters from other program.
Args:
other(Program): Other program
Returns:
None
"""
if not isinstance(other, Program):
raise TypeError("copy_param_info_from should be invoked with "
"Program")
if len(self.blocks) != len(other.blocks):
raise ValueError("copy_param_info_from should be invoked with two "
"program, with represent the same topology")
self.global_block().copy_param_info_from(other.global_block())
def list_vars(self): def list_vars(self):
for each_block in self.blocks: for each_block in self.blocks:
for each_var in each_block.vars.itervalues(): for each_var in each_block.vars.itervalues():
...@@ -600,13 +656,88 @@ class Parameter(Variable): ...@@ -600,13 +656,88 @@ class Parameter(Variable):
# program is a global instance. # program is a global instance.
g_main_program = Program() _main_program_ = Program()
g_startup_program = Program() _startup_program_ = Program()
def default_startup_program(): def default_startup_program():
return g_startup_program """
Get default startup program. In startup program, Paddle will initialize
parameters, initialize nccl handle, etc.
Returns:
Program: startup program
"""
return _startup_program_
def default_main_program(): def default_main_program():
return g_main_program """
Get default main program. The main program is used for training or testing.
Returns:
Program: main program
"""
return _main_program_
def switch_main_program(program):
"""
Switch the main program to a new program.
Args:
program(Program): The new main program
Returns:
Program: The previous main program
"""
global _main_program_
prev_program = _main_program_
_main_program_ = program
return prev_program
def switch_startup_program(program):
"""
Switch the startup program to a new program
Args:
program(Program): The new startup program
Returns:
Program: The previous startup program
"""
global _startup_program_
prev_program = _startup_program_
_startup_program_ = program
return prev_program
@contextlib.contextmanager
def program_guard(main_program, startup_program=None):
"""
Switch program with `with` statement
Examples:
>>> with program_guard(Program()):
>>> data = fluid.layers.data(...)
>>> hidden = fluid.layers.fc(...)
Args:
main_program(Program): New main program inside `with` statement
startup_program(Program): New startup program inside `with` statement.
None means do not change startup program.
Returns:
None
"""
if not isinstance(main_program, Program):
raise TypeError("main_program should be Program")
main_program = switch_main_program(main_program)
if startup_program is not None:
if not isinstance(startup_program, Program):
raise TypeError("startup_program should be Program")
startup_program = switch_startup_program(startup_program)
yield
switch_main_program(main_program)
if startup_program is not None:
switch_startup_program(startup_program)
import os import os
import cPickle as pickle import cPickle as pickle
from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \ from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
Variable
__all__ = [ __all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
...@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
""" """
if vars is None: if vars is None:
if main_program is None: if main_program is None:
main_program = g_main_program main_program = default_main_program()
if not isinstance(main_program, Program): if not isinstance(main_program, Program):
raise TypeError("program should be as Program type or None") raise TypeError("program should be as Program type or None")
...@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
:param executor: executor that save variable :param executor: executor that save variable
:param dirname: directory path :param dirname: directory path
:param main_program: program. If vars is None, then filter all variables in this :param main_program: program. If vars is None, then filter all variables in this
program which fit `predicate`. Default g_program. program which fit `predicate`. Default default_main_program().
:param predicate: The Predicate describes a callable that returns a variable :param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the variables will be loaded. as a bool. If it returns true, the variables will be loaded.
:param vars: variables need to be loaded. If specify vars, program & :param vars: variables need to be loaded. If specify vars, program &
...@@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
""" """
if vars is None: if vars is None:
if main_program is None: if main_program is None:
main_program = g_main_program main_program = default_main_program()
if not isinstance(main_program, Program): if not isinstance(main_program, Program):
raise TypeError("program's type should be Program") raise TypeError("program's type should be Program")
...@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None): ...@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None):
def get_inference_program(target_vars, main_program=None): def get_inference_program(target_vars, main_program=None):
if main_program is None: if main_program is None:
main_program = g_main_program main_program = default_main_program()
if not isinstance(target_vars, list): if not isinstance(target_vars, list):
target_vars = [target_vars] target_vars = [target_vars]
...@@ -177,12 +176,12 @@ def save_inference_model(dirname, ...@@ -177,12 +176,12 @@ def save_inference_model(dirname,
:param target_vars: Variables from which we can get inference results. :param target_vars: Variables from which we can get inference results.
:param executor: executor that save inference model :param executor: executor that save inference model
:param main_program: original program, which will be pruned to build the inference model. :param main_program: original program, which will be pruned to build the inference model.
Default g_main_program. Default default_main_program().
:return: None :return: None
""" """
if main_program is None: if main_program is None:
main_program = g_main_program main_program = default_main_program()
if not isinstance(target_vars, list): if not isinstance(target_vars, list):
target_vars = [target_vars] target_vars = [target_vars]
...@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None):
:param executor: executor for retrieving the value :param executor: executor for retrieving the value
:param name: the name of the parameter :param name: the name of the parameter
:param program: the program where the variable is found :param program: the program where the variable is found
Default g_main_program. Default default_main_program().
:return: the LoDTensor for the variable :return: the LoDTensor for the variable
""" """
if program is None: if program is None:
program = g_main_program program = default_main_program()
var = program.global_block().var(name) var = program.global_block().var(name)
return get_parameter_value(var, executor) return get_parameter_value(var, executor)
import copy import copy
import itertools import itertools
from framework import Variable, g_main_program, \ from framework import Variable, default_main_program, default_startup_program, \
g_startup_program, unique_name, dtype_is_floating unique_name, dtype_is_floating
from paddle.v2.fluid.initializer import Constant, Xavier from paddle.v2.fluid.initializer import Constant, Xavier
from param_attr import ParamAttr
class LayerHelper(object): class LayerHelper(object):
...@@ -22,7 +23,7 @@ class LayerHelper(object): ...@@ -22,7 +23,7 @@ class LayerHelper(object):
def main_program(self): def main_program(self):
prog = self.kwargs.get('main_program', None) prog = self.kwargs.get('main_program', None)
if prog is None: if prog is None:
return g_main_program return default_main_program()
else: else:
return prog return prog
...@@ -30,7 +31,7 @@ class LayerHelper(object): ...@@ -30,7 +31,7 @@ class LayerHelper(object):
def startup_program(self): def startup_program(self):
prog = self.kwargs.get('startup_program', None) prog = self.kwargs.get('startup_program', None)
if prog is None: if prog is None:
return g_startup_program return default_startup_program()
else: else:
return prog return prog
...@@ -60,31 +61,15 @@ class LayerHelper(object): ...@@ -60,31 +61,15 @@ class LayerHelper(object):
@property @property
def param_attr(self): def param_attr(self):
default = {'name': None} return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
actual = self.kwargs.get('param_attr', None)
if actual is None:
actual = default
for default_field in default.keys():
if default_field not in actual:
actual[default_field] = default[default_field]
return actual
@property @property
def bias_attr(self): def bias_attr(self):
default = {'name': None} return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
bias_attr = self.kwargs.get('bias_attr', None)
if bias_attr is None:
bias_attr = default
if isinstance(bias_attr, dict):
for default_field in default.keys():
if default_field not in bias_attr:
bias_attr[default_field] = default[default_field]
return bias_attr
def multiple_param_attr(self, length): def multiple_param_attr(self, length):
param_attr = self.param_attr param_attr = self.param_attr
if isinstance(param_attr, dict): if isinstance(param_attr, ParamAttr):
param_attr = [param_attr] param_attr = [param_attr]
if len(param_attr) != 1 and len(param_attr) != length: if len(param_attr) != 1 and len(param_attr) != length:
...@@ -112,23 +97,30 @@ class LayerHelper(object): ...@@ -112,23 +97,30 @@ class LayerHelper(object):
raise ValueError("Data Type mismatch") raise ValueError("Data Type mismatch")
return dtype return dtype
def create_parameter(self, attr, shape, dtype, suffix='w', def create_parameter(self,
initializer=None): attr,
shape,
dtype,
is_bias=False,
default_initializer=None):
# Deepcopy the attr so that parameters can be shared in program # Deepcopy the attr so that parameters can be shared in program
attr_copy = copy.deepcopy(attr) assert isinstance(attr, ParamAttr)
if initializer is not None: suffix = 'b' if is_bias else 'w'
attr_copy['initializer'] = initializer
if default_initializer is None:
if is_bias:
attr.set_default_bias_initializer()
else:
attr.set_default_param_initializer()
else: else:
attr_copy['initializer'] = self._get_default_initializer(dtype) attr.set_default_initializer(default_initializer)
if attr_copy['name'] is None: if attr.name is None:
attr_copy['name'] = unique_name(".".join([self.name, suffix])) attr.name = unique_name(".".join([self.name, suffix]))
self.startup_program.global_block().create_parameter( self.startup_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr_copy) dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
return self.main_program.global_block().create_parameter( return self.main_program.global_block().create_parameter(
name=attr_copy['name'], dtype=dtype, shape=shape, **attr.to_kwargs())
dtype=dtype,
shape=shape,
trainable=attr_copy.get('trainable', True))
def create_tmp_variable(self, dtype): def create_tmp_variable(self, dtype):
return self.main_program.current_block().create_var( return self.main_program.current_block().create_var(
...@@ -153,11 +145,7 @@ class LayerHelper(object): ...@@ -153,11 +145,7 @@ class LayerHelper(object):
persistable=True, persistable=True,
initializer=initializer) initializer=initializer)
def append_bias_op(self, def append_bias_op(self, input_var, dim_start=1, dim_end=None):
input_var,
bias_initializer,
dim_start=1,
dim_end=None):
""" """
Append bias operator and return its output. If the user does not set Append bias operator and return its output. If the user does not set
bias_attr, append_bias_op will return input_var bias_attr, append_bias_op will return input_var
...@@ -177,11 +165,7 @@ class LayerHelper(object): ...@@ -177,11 +165,7 @@ class LayerHelper(object):
return input_var return input_var
b = self.create_parameter( b = self.create_parameter(
attr=bias_attr, attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
shape=size,
dtype=input_var.dtype,
suffix='b',
initializer=bias_initializer)
tmp = self.create_tmp_variable(dtype=input_var.dtype) tmp = self.create_tmp_variable(dtype=input_var.dtype)
self.append_op( self.append_op(
type='elementwise_add', type='elementwise_add',
......
from . import core import core
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
from framework import OpProtoHolder, Variable, Program, Operator from framework import OpProtoHolder, Variable, Program, Operator
from initializer import Constant, Normal, Xavier from initializer import Constant, Normal, Xavier, Initializer
from paddle.v2.fluid.layer_helper import LayerHelper, unique_name from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
import re import re
import cStringIO import cStringIO
from param_attr import ParamAttr
__all__ = [ __all__ = [
'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
...@@ -17,9 +18,7 @@ def fc(input, ...@@ -17,9 +18,7 @@ def fc(input,
size, size,
num_flatten_dims=1, num_flatten_dims=1,
param_attr=None, param_attr=None,
param_initializer=None,
bias_attr=None, bias_attr=None,
bias_initializer=None,
act=None, act=None,
name=None, name=None,
main_program=None, main_program=None,
...@@ -54,23 +53,10 @@ def fc(input, ...@@ -54,23 +53,10 @@ def fc(input,
to the LayerHelper constructor. to the LayerHelper constructor.
""" """
def _get_default_param_initializer():
return Xavier()
def _get_default_bias_initializer():
return Constant()
helper = LayerHelper('fc', **locals()) helper = LayerHelper('fc', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
if param_initializer is None:
param_initializer = _get_default_param_initializer()
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
mul_results = [] mul_results = []
for input_var, param_attr in helper.iter_inputs_and_params(): for input_var, param_attr in helper.iter_inputs_and_params():
input_shape = input_var.shape input_shape = input_var.shape
...@@ -78,10 +64,7 @@ def fc(input, ...@@ -78,10 +64,7 @@ def fc(input,
reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
] + [size] ] + [size]
w = helper.create_parameter( w = helper.create_parameter(
attr=param_attr, attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
initializer=param_initializer,
shape=param_shape,
dtype=dtype)
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type="mul", type="mul",
...@@ -102,7 +85,7 @@ def fc(input, ...@@ -102,7 +85,7 @@ def fc(input,
helper.append_op( helper.append_op(
type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
# add bias # add bias
pre_activation = helper.append_bias_op(pre_bias, bias_initializer) pre_activation = helper.append_bias_op(pre_bias)
# add activation # add activation
return helper.append_activation(pre_activation) return helper.append_activation(pre_activation)
...@@ -110,7 +93,6 @@ def fc(input, ...@@ -110,7 +93,6 @@ def fc(input,
def embedding(input, def embedding(input,
size, size,
is_sparse=False, is_sparse=False,
param_initializer=None,
param_attr=None, param_attr=None,
dtype='float32', dtype='float32',
main_program=None, main_program=None,
...@@ -119,6 +101,7 @@ def embedding(input, ...@@ -119,6 +101,7 @@ def embedding(input,
Embedding Layer. Embedding Layer.
Args: Args:
param_initializer:
input: The input to the function input: The input to the function
size: The size of the layer size: The size of the layer
is_sparse: A flag that decleares whether the input is sparse is_sparse: A flag that decleares whether the input is sparse
...@@ -136,15 +119,9 @@ def embedding(input, ...@@ -136,15 +119,9 @@ def embedding(input,
""" """
def _get_default_param_initializer():
return Xavier()
helper = LayerHelper('embedding', **locals()) helper = LayerHelper('embedding', **locals())
w = helper.create_parameter( w = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
shape=size,
dtype=dtype,
initializer=param_initializer or _get_default_param_initializer())
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type='lookup_table', type='lookup_table',
...@@ -176,7 +153,7 @@ def dynamic_lstm(input, ...@@ -176,7 +153,7 @@ def dynamic_lstm(input,
if not use_peepholes: if not use_peepholes:
bias_size[1] = 4 * size bias_size[1] = 4 * size
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, suffix='b') attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
hidden = helper.create_tmp_variable(dtype) hidden = helper.create_tmp_variable(dtype)
cell = helper.create_tmp_variable(dtype) cell = helper.create_tmp_variable(dtype)
...@@ -471,19 +448,14 @@ def sums(input, out=None, main_program=None, startup_program=None): ...@@ -471,19 +448,14 @@ def sums(input, out=None, main_program=None, startup_program=None):
def linear_chain_crf(input, def linear_chain_crf(input,
label, label,
param_attr=None, param_attr=None,
param_initializer=None,
main_program=None, main_program=None,
startup_program=None): startup_program=None):
def _get_default_param_initializer():
return Xavier()
helper = LayerHelper('linear_chain_crf', **locals()) helper = LayerHelper('linear_chain_crf', **locals())
size = input.shape[1] size = input.shape[1]
transition = helper.create_parameter( transition = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr,
shape=[size + 2, size], shape=[size + 2, size],
dtype=helper.input_dtype(), dtype=helper.input_dtype())
initializer=param_initializer or _get_default_param_initializer())
alpha = helper.create_tmp_variable(dtype=helper.input_dtype()) alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
...@@ -646,9 +618,7 @@ def sequence_conv(input, ...@@ -646,9 +618,7 @@ def sequence_conv(input,
filter_stride=1, filter_stride=1,
padding=None, padding=None,
bias_attr=None, bias_attr=None,
bias_initializer=None,
param_attr=None, param_attr=None,
param_initializer=None,
act=None, act=None,
main_program=None, main_program=None,
startup_program=None): startup_program=None):
...@@ -658,30 +628,15 @@ def sequence_conv(input, ...@@ -658,30 +628,15 @@ def sequence_conv(input,
in the input parameters to the function. in the input parameters to the function.
""" """
def _get_default_bias_initializer():
return Constant()
def _get_default_param_initializer():
return Xavier()
# FIXME(dzh) : want to unify the argument of python layer # FIXME(dzh) : want to unify the argument of python layer
# function. So we ignore some unecessary attributes. # function. So we ignore some unecessary attributes.
# such as, padding_trainable, context_start. # such as, padding_trainable, context_start.
helper = LayerHelper('sequence_conv', **locals()) helper = LayerHelper('sequence_conv', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
if param_initializer is None:
param_initializer = _get_default_param_initializer()
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
filter_shape = [filter_size * input.shape[1], num_filters] filter_shape = [filter_size * input.shape[1], num_filters]
filter = helper.create_parameter( filter = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr, shape=filter_shape, dtype=dtype)
shape=filter_shape,
dtype=dtype,
initializer=param_initializer)
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
...@@ -696,7 +651,7 @@ def sequence_conv(input, ...@@ -696,7 +651,7 @@ def sequence_conv(input,
'contextStart': -int(filter_size / 2), 'contextStart': -int(filter_size / 2),
'contextLength': filter_size 'contextLength': filter_size
}) })
pre_act = helper.append_bias_op(pre_bias, bias_initializer) pre_act = helper.append_bias_op(pre_bias)
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
...@@ -707,9 +662,7 @@ def conv2d(input, ...@@ -707,9 +662,7 @@ def conv2d(input,
padding=None, padding=None,
groups=None, groups=None,
param_attr=None, param_attr=None,
param_initializer=None,
bias_attr=None, bias_attr=None,
bias_initializer=None,
act=None, act=None,
name=None, name=None,
main_program=None, main_program=None,
...@@ -722,13 +675,6 @@ def conv2d(input, ...@@ -722,13 +675,6 @@ def conv2d(input,
conv-2d output, if mentioned in the input parameters. conv-2d output, if mentioned in the input parameters.
""" """
def _get_default_bias_initializer():
return Constant()
def _get_default_param_initializer(filter_size, num_channels):
std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
return Normal(0.0, std, 0)
helper = LayerHelper('conv2d', **locals()) helper = LayerHelper('conv2d', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -750,17 +696,16 @@ def conv2d(input, ...@@ -750,17 +696,16 @@ def conv2d(input,
input_shape = input.shape input_shape = input.shape
filter_shape = [num_filters, num_filter_channels] + filter_size filter_shape = [num_filters, num_filter_channels] + filter_size
if param_initializer is None: def _get_default_param_initializer():
param_initializer = _get_default_param_initializer(filter_size, std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
num_channels) return Normal(0.0, std, 0)
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
filter = helper.create_parameter( filter = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr,
shape=filter_shape, shape=filter_shape,
dtype=dtype, dtype=dtype,
initializer=param_initializer) default_initializer=_get_default_param_initializer())
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
...@@ -774,8 +719,7 @@ def conv2d(input, ...@@ -774,8 +719,7 @@ def conv2d(input,
'paddings': padding, 'paddings': padding,
'groups': groups}) 'groups': groups})
pre_act = helper.append_bias_op( pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
pre_bias, bias_initializer, dim_start=1, dim_end=2)
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
...@@ -876,12 +820,10 @@ def batch_norm(input, ...@@ -876,12 +820,10 @@ def batch_norm(input,
attr=helper.param_attr, attr=helper.param_attr,
shape=param_shape, shape=param_shape,
dtype=dtype, dtype=dtype,
initializer=Constant(1.0)) default_initializer=Constant(1.0))
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
shape=param_shape,
dtype=dtype,
initializer=Constant(0.0))
mean = helper.create_global_variable( mean = helper.create_global_variable(
dtype=input.dtype, shape=param_shape, persistable=True) dtype=input.dtype, shape=param_shape, persistable=True)
...@@ -1587,6 +1529,93 @@ def array_length(array, main_program=None): ...@@ -1587,6 +1529,93 @@ def array_length(array, main_program=None):
return tmp return tmp
def conv2d_transpose(input,
num_filters,
output_size=None,
filter_size=None,
padding=None,
stride=None,
param_attr=None,
main_program=None,
startup_program=None):
"""
The transpose of conv2d layer.
This layer is also known as deconvolution layer.
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of filter. It is as same as the output
image channel.
output_size(int|tuple|None): The output image size. If output size is a
tuple, it must contain two integers, (image_H, image_W). This
parameter only works when filter_size is None.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square. None if use output size to
calculate filter_size
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride.
param_attr: Parameter Attribute.
main_program(Program): the main program
startup_program(Program): the startup program
Returns:
Variable: Output image.
"""
helper = LayerHelper("conv2d_transpose", **locals())
if not isinstance(input, Variable):
raise TypeError("Input of conv2d_transpose must be Variable")
input_channel = input.shape[1]
op_attr = dict()
if isinstance(padding, int):
op_attr['paddings'] = [padding, padding]
elif padding is not None:
op_attr['paddings'] = padding
if isinstance(stride, int):
op_attr['strides'] = stride
elif stride is not None:
op_attr['strides'] = stride
if filter_size is None:
if output_size is None:
raise ValueError("output_size must be set when filter_size is None")
if isinstance(output_size, int):
output_size = [output_size, output_size]
padding = op_attr.get('paddings', [0, 0])
stride = op_attr.get('strides', [1, 1])
h_in = input.shape[2]
w_in = input.shape[3]
filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
filter_size = [filter_size_h, filter_size_w]
elif isinstance(filter_size, int):
filter_size = [filter_size, filter_size]
filter_shape = [input_channel, num_filters] + filter_size
img_filter = helper.create_parameter(
dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
out = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op(
type='conv2d_transpose',
inputs={'Input': [input],
'Filter': [img_filter]},
outputs={'Output': out},
attrs=op_attr)
return out
class ConditionalBlockGuard(BlockGuard): class ConditionalBlockGuard(BlockGuard):
def __init__(self, block): def __init__(self, block):
if not isinstance(block, ConditionalBlock): if not isinstance(block, ConditionalBlock):
......
from initializer import Initializer, Xavier, Constant
from regularizer import WeightDecayRegularizer
class ParamAttr(object):
def __init__(self,
name=None,
initializer=None,
learning_rate=1.0,
regularizer=None,
trainable=True):
self.name = name
self.initializer = initializer
self.learning_rate = learning_rate
self.regularizer = regularizer
self.trainable = trainable
def set_default_initializer(self, initializer):
if initializer is None:
if self.initializer is None:
raise ValueError("ParamAttr.initializer is not set")
return
if self.initializer is not None:
return
self.initializer = initializer
def set_default_param_initializer(self):
self.set_default_initializer(Xavier())
def set_default_bias_initializer(self):
self.set_default_initializer(Constant(0.0))
@staticmethod
def to_attr(arg):
if arg is None:
return ParamAttr()
elif isinstance(arg, ParamAttr):
return arg
elif isinstance(arg, str) or isinstance(arg, unicode):
return ParamAttr(name=arg)
elif isinstance(arg, Initializer):
return ParamAttr(initializer=arg)
elif isinstance(arg, WeightDecayRegularizer):
return ParamAttr(regularizer=arg)
elif isinstance(arg, bool):
return ParamAttr.to_attr(None) if arg else False
else:
raise TypeError("{0} cast to ParamAttr".format(type(arg)))
def to_kwargs(self, with_initializer=False):
kwargs = {
'name': self.name,
'learning_rate': self.learning_rate,
'regularizer': self.regularizer,
'trainable': self.trainable
}
if with_initializer:
kwargs['initializer'] = self.initializer
return kwargs
import paddle.v2.fluid.core as core
from contextlib import contextmanager
__all__ = ['CudaProfiler']
NVPROF_CONFIG = [
"gpustarttimestamp",
"gpuendtimestamp",
"gridsize3d",
"threadblocksize",
"streamid",
"enableonstart 0",
"conckerneltrace",
]
@contextmanager
def cuda_profiler(output_file, output_mode=None, config=None):
"""The CUDA profiler.
This fuctions is used to profile CUDA program by CUDA runtime application
programming interface. The profiling result will be written into
`output_file` with Key-Value pair format or Comma separated values format.
The user can set the output mode by `output_mode` argument and set the
counters/options for profiling by `config` argument. The default config
is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
Args:
output_file (string) : The output file name, the result will be
written into this file.
output_mode (string) : The output mode has Key-Value pair format and
Comma separated values format. It should be 'kvp' or 'csv'.
config (string) : The profiler options and counters can refer to
"Compute Command Line Profiler User Guide".
"""
if output_mode is None:
output_mode = 'csv'
if output_mode not in ['kvp', 'csv']:
raise ValueError("The output mode must be 'kvp' or 'csv'.")
config = NVPROF_CONFIG if config is None else config
core.nvprof_init(output_file, output_mode, config)
# Enables profiler collection by the active CUDA profiling tool.
core.nvprof_start()
yield
# Disables profiler collection.
core.nvprof_stop()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
list(REMOVE_ITEM TEST_OPS test_image_classification_train)
py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
# default test
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) py_test(${src} SRCS ${src}.py)
endforeach() endforeach()
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import sys
def resnet_cifar10(input, depth=32): def resnet_cifar10(input, depth=32):
...@@ -67,8 +69,7 @@ def vgg16_bn_drop(input): ...@@ -67,8 +69,7 @@ def vgg16_bn_drop(input):
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None) fc1 = fluid.layers.fc(input=drop, size=512, act=None)
reshape1 = fluid.layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1))) bn = fluid.layers.batch_norm(input=fc1, act='relu')
bn = fluid.layers.batch_norm(input=reshape1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None) fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2 return fc2
...@@ -80,11 +81,18 @@ data_shape = [3, 32, 32] ...@@ -80,11 +81,18 @@ data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Add neural network config net_type = "vgg"
# option 1. resnet if len(sys.argv) >= 2:
# net = resnet_cifar10(images, 32) net_type = sys.argv[1]
# option 2. vgg
net = vgg16_bn_drop(images) if net_type == "vgg":
print("train vgg net")
net = vgg16_bn_drop(images)
elif net_type == "resnet":
print("train resnet")
net = resnet_cifar10(images, 32)
else:
raise ValueError("%s network is not supported" % net_type)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax') predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label)
......
...@@ -44,7 +44,7 @@ def db_lstm(): ...@@ -44,7 +44,7 @@ def db_lstm():
size=[pred_len, word_dim], size=[pred_len, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'vemb'}) param_attr='vemb')
mark_embedding = fluid.layers.embedding( mark_embedding = fluid.layers.embedding(
input=mark, input=mark,
...@@ -57,8 +57,8 @@ def db_lstm(): ...@@ -57,8 +57,8 @@ def db_lstm():
fluid.layers.embedding( fluid.layers.embedding(
size=[word_dict_len, word_dim], size=[word_dict_len, word_dim],
input=x, input=x,
param_attr={'name': embedding_name, param_attr=fluid.ParamAttr(
'trainable': False}) for x in word_input name=embedding_name, trainable=False)) for x in word_input
] ]
emb_layers.append(predicate_embedding) emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding) emb_layers.append(mark_embedding)
...@@ -125,8 +125,8 @@ def main(): ...@@ -125,8 +125,8 @@ def main():
crf_cost = fluid.layers.linear_chain_crf( crf_cost = fluid.layers.linear_chain_crf(
input=feature_out, input=feature_out,
label=target, label=target,
param_attr={"name": 'crfw', param_attr=fluid.ParamAttr(
"learning_rate": mix_hidden_lr}) name='crfw', learning_rate=mix_hidden_lr))
avg_cost = fluid.layers.mean(x=crf_cost) avg_cost = fluid.layers.mean(x=crf_cost)
# TODO(qiao) # TODO(qiao)
# 1. add crf_decode_layer and evaluator # 1. add crf_decode_layer and evaluator
......
...@@ -6,24 +6,21 @@ import paddle.v2.fluid as fluid ...@@ -6,24 +6,21 @@ import paddle.v2.fluid as fluid
BATCH_SIZE = 128 BATCH_SIZE = 128
image = fluid.layers.data(name='x', shape=[784], dtype='float32') image = fluid.layers.data(name='x', shape=[784], dtype='float32')
param_attr = { regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
'name': None,
'regularization': fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
}
hidden1 = fluid.layers.fc(input=image, hidden1 = fluid.layers.fc(input=image,
size=128, size=128,
act='relu', act='relu',
param_attr=param_attr) param_attr=regularizer)
hidden2 = fluid.layers.fc(input=hidden1, hidden2 = fluid.layers.fc(input=hidden1,
size=64, size=64,
act='relu', act='relu',
param_attr=param_attr) param_attr=regularizer)
predict = fluid.layers.fc(input=hidden2, predict = fluid.layers.fc(input=hidden2,
size=10, size=10,
act='softmax', act='softmax',
param_attr=param_attr) param_attr=regularizer)
label = fluid.layers.data(name='y', shape=[1], dtype='int64') label = fluid.layers.data(name='y', shape=[1], dtype='int64')
...@@ -35,6 +32,13 @@ opts = optimizer.minimize(avg_cost) ...@@ -35,6 +32,13 @@ opts = optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label) accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
inference_program = fluid.default_main_program().clone()
test_accuracy = fluid.evaluator.Accuracy(
input=predict, label=label, main_program=inference_program)
test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
inference_program = fluid.io.get_inference_program(
test_target, main_program=inference_program)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192), paddle.dataset.mnist.train(), buf_size=8192),
...@@ -69,11 +73,6 @@ for pass_id in range(PASS_NUM): ...@@ -69,11 +73,6 @@ for pass_id in range(PASS_NUM):
acc = np.array(outs[1]) acc = np.array(outs[1])
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
test_accuracy.reset(exe) test_accuracy.reset(exe)
for data in test_reader(): for data in test_reader():
x_data = np.array(map(lambda x: x[0], data)).astype("float32") x_data = np.array(map(lambda x: x[0], data)).astype("float32")
......
...@@ -24,7 +24,7 @@ def get_usr_combined_features(): ...@@ -24,7 +24,7 @@ def get_usr_combined_features():
input=uid, input=uid,
dtype='float32', dtype='float32',
size=[USR_DICT_SIZE, 32], size=[USR_DICT_SIZE, 32],
param_attr={'name': 'user_table'}, param_attr='user_table',
is_sparse=IS_SPARSE) is_sparse=IS_SPARSE)
usr_fc = layers.fc(input=usr_emb, size=32) usr_fc = layers.fc(input=usr_emb, size=32)
...@@ -36,7 +36,7 @@ def get_usr_combined_features(): ...@@ -36,7 +36,7 @@ def get_usr_combined_features():
usr_gender_emb = layers.embedding( usr_gender_emb = layers.embedding(
input=usr_gender_id, input=usr_gender_id,
size=[USR_GENDER_DICT_SIZE, 16], size=[USR_GENDER_DICT_SIZE, 16],
param_attr={'name': 'gender_table'}, param_attr='gender_table',
is_sparse=IS_SPARSE) is_sparse=IS_SPARSE)
usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
...@@ -48,7 +48,7 @@ def get_usr_combined_features(): ...@@ -48,7 +48,7 @@ def get_usr_combined_features():
input=usr_age_id, input=usr_age_id,
size=[USR_AGE_DICT_SIZE, 16], size=[USR_AGE_DICT_SIZE, 16],
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'age_table'}) param_attr='age_table')
usr_age_fc = layers.fc(input=usr_age_emb, size=16) usr_age_fc = layers.fc(input=usr_age_emb, size=16)
...@@ -58,7 +58,7 @@ def get_usr_combined_features(): ...@@ -58,7 +58,7 @@ def get_usr_combined_features():
usr_job_emb = layers.embedding( usr_job_emb = layers.embedding(
input=usr_job_id, input=usr_job_id,
size=[USR_JOB_DICT_SIZE, 16], size=[USR_JOB_DICT_SIZE, 16],
param_attr={'name': 'job_table'}, param_attr='job_table',
is_sparse=IS_SPARSE) is_sparse=IS_SPARSE)
usr_job_fc = layers.fc(input=usr_job_emb, size=16) usr_job_fc = layers.fc(input=usr_job_emb, size=16)
...@@ -81,7 +81,7 @@ def get_mov_combined_features(): ...@@ -81,7 +81,7 @@ def get_mov_combined_features():
input=mov_id, input=mov_id,
dtype='float32', dtype='float32',
size=[MOV_DICT_SIZE, 32], size=[MOV_DICT_SIZE, 32],
param_attr={'name': 'movie_table'}, param_attr='movie_table',
is_sparse=IS_SPARSE) is_sparse=IS_SPARSE)
mov_fc = layers.fc(input=mov_emb, size=32) mov_fc = layers.fc(input=mov_emb, size=32)
......
...@@ -23,25 +23,25 @@ embed_first = fluid.layers.embedding( ...@@ -23,25 +23,25 @@ embed_first = fluid.layers.embedding(
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'}) param_attr='shared_w')
embed_second = fluid.layers.embedding( embed_second = fluid.layers.embedding(
input=second_word, input=second_word,
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'}) param_attr='shared_w')
embed_third = fluid.layers.embedding( embed_third = fluid.layers.embedding(
input=third_word, input=third_word,
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'}) param_attr='shared_w')
embed_forth = fluid.layers.embedding( embed_forth = fluid.layers.embedding(
input=forth_word, input=forth_word,
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'}) param_attr='shared_w')
concat_embed = fluid.layers.concat( concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], axis=1) input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
......
...@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core ...@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward_ops
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
...@@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase): ...@@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase):
append_backward_ops(total_sum_scaled) append_backward_ops(total_sum_scaled)
g_vars = map(g_main_program.global_block().var, g_vars = map(default_main_program().global_block().var,
[each_x.name + "@GRAD" for each_x in x]) [each_x.name + "@GRAD" for each_x in x])
g_out = [ g_out = [
item.sum() item.sum()
......
...@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set): ...@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set):
def _reference_training(x, scale, offset, epsilon, data_format): def _reference_training(x, scale, offset, epsilon, data_format):
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
if data_format == "NCHW": if data_format == "NCHW":
n, c, h, w = x.shape n, c, h, w = x.shape
x_square = x * x x_square = x * x
...@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format): ...@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format):
offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset, (1, c, 1, 1))
offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
y = normalized * scale_tile + offset_tile y = normalized * scale_tile + offset_tile
if len(x_shape) == 2:
y = np.reshape(y, (y.shape[0], y.shape[1]))
return y, mean, var return y, mean, var
elif data_format == "NHWC": elif data_format == "NHWC":
x_square = x * x x_square = x * x
...@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format): ...@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format):
mean = x_sum / element_count mean = x_sum / element_count
var = x_square_sum / element_count - mean * mean var = x_square_sum / element_count - mean * mean
normalized = (x - mean) / np.sqrt(var + epsilon) normalized = (x - mean) / np.sqrt(var + epsilon)
return (normalized * scale + offset), mean, var y = normalized * scale + offset
if len(x_shape) == 2:
y = np.reshape(y, x_shape)
return y, mean, var
else: else:
raise ValueError("Unknown data order.") raise ValueError("Unknown data order.")
...@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): ...@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
# (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon)) # (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
# transfer from (N, C, H, W) to (N, H, W, C) to simplify computation # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
grad_y = np.reshape(grad_y,
(grad_y.shape[0], grad_y.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
grad_y = np.reshape(grad_y,
(grad_y.shape[0], 1, 1, grad_y.shape[1]))
if data_format == "NCHW": if data_format == "NCHW":
x = np.transpose(x, (0, 2, 3, 1)) x = np.transpose(x, (0, 2, 3, 1))
grad_y = np.transpose(grad_y, (0, 2, 3, 1)) grad_y = np.transpose(grad_y, (0, 2, 3, 1))
...@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): ...@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
grad_x = np.transpose(grad_x, (0, 3, 1, 2)) grad_x = np.transpose(grad_x, (0, 3, 1, 2))
x = np.transpose(x, (0, 3, 1, 2)) x = np.transpose(x, (0, 3, 1, 2))
grad_y = np.transpose(grad_y, (0, 3, 1, 2)) grad_y = np.transpose(grad_y, (0, 3, 1, 2))
if len(x_shape) == 2:
grad_x = np.reshape(grad_x, x_shape)
return grad_x, grad_scale, grad_offset return grad_x, grad_scale, grad_offset
...@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest): ...@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest):
momentum = 0.9 momentum = 0.9
# N, H, W, C: 2, 3, 4, 2 # N, H, W, C: 2, 3, 4, 2
n, h, w, c = 2, 3, 4, 2 n, h, w, c = 2, 3, 4, 5
x_shape = [n, h, w, c] x_shape = [n, h, w, c]
scale_shape = [c] scale_shape = [c]
...@@ -184,14 +211,17 @@ class TestBatchNormOp(OpTest): ...@@ -184,14 +211,17 @@ class TestBatchNormOp(OpTest):
print 'python: NHWC, NCHW, backward checking passed' print 'python: NHWC, NCHW, backward checking passed'
def test_forward_backward(self): def test_forward_backward(self):
def test_with_place(place, tensor_format): def test_with_place(place, tensor_format, shape):
# attr # attr
epsilon = 0.00001 epsilon = 0.00001
momentum = 0.9 momentum = 0.9
# N, H, W, C: 12, 3, 4, 2 if len(shape) == 2:
n, h, w, c = 2, 3, 4, 2 x_shape = shape
c = shape[1]
else:
# n, h, w, c = 2, 3, 4, 2
n, h, w, c = shape[0], shape[1], shape[2], shape[3]
if data_format == "NHWC": if data_format == "NHWC":
x_shape = [n, h, w, c] x_shape = [n, h, w, c]
elif data_format == "NCHW": elif data_format == "NCHW":
...@@ -219,6 +249,9 @@ class TestBatchNormOp(OpTest): ...@@ -219,6 +249,9 @@ class TestBatchNormOp(OpTest):
# for gradient test # for gradient test
# y_grad = np.ones(x_shape).astype(np.float32) # y_grad = np.ones(x_shape).astype(np.float32)
y_grad = np.zeros(x_shape).astype(np.float32) y_grad = np.zeros(x_shape).astype(np.float32)
if len(y_grad.shape) == 2:
y_grad[0, 0] = 1.
else:
y_grad[0, 0, 0, 0] = 1. y_grad[0, 0, 0, 0] = 1.
# y_grad = np.random.random_sample(x_shape).astype(np.float32) # y_grad = np.random.random_sample(x_shape).astype(np.float32)
x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad( x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
...@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest): ...@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest):
places.append(core.GPUPlace(0)) places.append(core.GPUPlace(0))
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: for data_format in ["NCHW", "NHWC"]:
test_with_place(place, data_format) test_with_place(place, data_format, [2, 3, 4, 5])
test_with_place(place, data_format, [2, 3])
if __name__ == '__main__': if __name__ == '__main__':
......
import unittest import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import g_startup_program, g_main_program from paddle.v2.fluid.framework import default_startup_program, default_main_program
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward_ops
import numpy import numpy
...@@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase): ...@@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase):
cpu = core.CPUPlace() cpu = core.CPUPlace()
exe = Executor(cpu) exe = Executor(cpu)
exe.run(g_startup_program) exe.run(default_startup_program())
x = numpy.random.random(size=(10, 1)).astype('float32') x = numpy.random.random(size=(10, 1)).astype('float32')
...@@ -29,7 +29,9 @@ class ConditionalBlock(unittest.TestCase): ...@@ -29,7 +29,9 @@ class ConditionalBlock(unittest.TestCase):
append_backward_ops(loss=loss) append_backward_ops(loss=loss)
outs = exe.run( outs = exe.run(
feed={'X': x}, feed={'X': x},
fetch_list=[g_main_program.block(0).var(data.name + "@GRAD")])[0] fetch_list=[
default_main_program().block(0).var(data.name + "@GRAD")
])[0]
print outs print outs
......
import unittest import unittest
from paddle.v2.fluid.layers import mul, data, sequence_pool
import numpy
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.layers import mul, data
import numpy
class TestExecutor(unittest.TestCase): class TestExecutor(unittest.TestCase):
...@@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase): ...@@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase):
a_np = numpy.random.random((100, 784)).astype('float32') a_np = numpy.random.random((100, 784)).astype('float32')
b_np = numpy.random.random((784, 100)).astype('float32') b_np = numpy.random.random((784, 100)).astype('float32')
exe = Executor(place) exe = Executor(place)
outs = exe.run(g_main_program, outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
feed={'a': a_np,
'b': b_np},
fetch_list=[out])
out = outs[0] out = outs[0]
self.assertEqual((100, 100), out.shape) self.assertEqual((100, 100), out.shape)
self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np))) self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
......
import unittest import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid as fluid
import paddle.v2.fluid.nets as nets import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program
...@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase): ...@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase):
def test_batch_norm_layer(self): def test_batch_norm_layer(self):
main_program = Program() main_program = Program()
startup_program = Program() startup_program = Program()
images = layers.data( images = fluid.layers.data(
name='pixel', name='pixel',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
main_program=main_program) main_program=main_program)
layers.batch_norm( hidden1 = fluid.layers.batch_norm(
input=images, input=images,
main_program=main_program, main_program=main_program,
startup_program=startup_program) startup_program=startup_program)
hidden2 = fluid.layers.fc(input=hidden1,
size=128,
act='relu',
main_program=main_program)
hidden3 = fluid.layers.batch_norm(
input=hidden2,
main_program=main_program,
startup_program=startup_program)
# print str(main_program) print str(main_program)
def test_dropout_layer(self): def test_dropout_layer(self):
main_program = Program() main_program = Program()
startup_program = Program() startup_program = Program()
images = layers.data( images = fluid.layers.data(
name='pixel', name='pixel',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
main_program=main_program) main_program=main_program)
layers.dropout( fluid.layers.dropout(
x=images, x=images,
dropout_prob=0.5, dropout_prob=0.5,
main_program=main_program, main_program=main_program,
...@@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase): ...@@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase):
main_program = Program() main_program = Program()
startup_program = Program() startup_program = Program()
images = layers.data( images = fluid.layers.data(
name='pixel', name='pixel',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
...@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase): ...@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase):
def test_elementwise_add_with_act(self): def test_elementwise_add_with_act(self):
main_program = Program() main_program = Program()
startup_program = Program() startup_program = Program()
image1 = layers.data( image1 = fluid.layers.data(
name='pixel1', name='pixel1',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
main_program=main_program, main_program=main_program,
startup_program=startup_program) startup_program=startup_program)
image2 = layers.data( image2 = fluid.layers.data(
name='pixel2', name='pixel2',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
main_program=main_program, main_program=main_program,
startup_program=startup_program) startup_program=startup_program)
out = layers.elementwise_add( out = fluid.layers.elementwise_add(
x=image1, x=image1,
y=image2, y=image2,
act='relu', act='relu',
......
from __future__ import print_function
import unittest import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program, program_guard
class TestBook(unittest.TestCase): class TestBook(unittest.TestCase):
def test_fit_a_line(self): def test_fit_a_line(self):
program = Program() program = Program()
x = layers.data( with program_guard(program, startup_program=Program()):
name='x', shape=[13], dtype='float32', main_program=program) x = layers.data(name='x', shape=[13], dtype='float32')
y_predict = layers.fc(input=x, size=1, act=None, main_program=program) y_predict = layers.fc(input=x, size=1, act=None)
y = layers.data(name='y', shape=[1], dtype='float32')
y = layers.data( cost = layers.square_error_cost(input=y_predict, label=y)
name='y', shape=[1], dtype='float32', main_program=program) avg_cost = layers.mean(x=cost)
cost = layers.square_error_cost(
input=y_predict, label=y, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost) self.assertIsNotNone(avg_cost)
program.append_backward(avg_cost) program.append_backward(avg_cost)
print str(program) print(str(program))
def test_recognize_digits_mlp(self): def test_recognize_digits_mlp(self):
program = Program() program = Program()
with program_guard(program, startup_program=Program()):
# Change g_program, so the rest layers use `g_program` # Change g_program, so the rest layers use `g_program`
images = layers.data( images = layers.data(name='pixel', shape=[784], dtype='float32')
name='pixel', shape=[784], dtype='float32', main_program=program) label = layers.data(name='label', shape=[1], dtype='int32')
label = layers.data( hidden1 = layers.fc(input=images, size=128, act='relu')
name='label', shape=[1], dtype='int32', main_program=program) hidden2 = layers.fc(input=hidden1, size=64, act='relu')
hidden1 = layers.fc(input=images, predict = layers.fc(input=hidden2, size=10, act='softmax')
size=128, cost = layers.cross_entropy(input=predict, label=label)
act='relu', avg_cost = layers.mean(x=cost)
main_program=program)
hidden2 = layers.fc(input=hidden1,
size=64,
act='relu',
main_program=program)
predict = layers.fc(input=hidden2,
size=10,
act='softmax',
main_program=program)
cost = layers.cross_entropy(
input=predict, label=label, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost) self.assertIsNotNone(avg_cost)
print str(program) print(str(program))
def test_simple_conv2d(self): def test_simple_conv2d(self):
program = Program() program = Program()
images = layers.data( with program_guard(program, startup_program=Program()):
name='pixel', images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32')
shape=[3, 48, 48], layers.conv2d(input=images, num_filters=3, filter_size=[4, 4])
dtype='int32',
main_program=program)
layers.conv2d(
input=images,
num_filters=3,
filter_size=[4, 4],
main_program=program)
print str(program) print(str(program))
def test_recognize_digits_conv(self): def test_conv2d_transpose(self):
program = Program() program = Program()
with program_guard(program):
img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
layers.conv2d_transpose(input=img, num_filters=10, output_size=28)
print(str(program))
def test_recognize_digits_conv(self):
program = Program()
with program_guard(program, startup_program=Program()):
images = layers.data( images = layers.data(
name='pixel', name='pixel', shape=[1, 28, 28], dtype='float32')
shape=[1, 28, 28], label = layers.data(name='label', shape=[1], dtype='int32')
dtype='float32',
main_program=program)
label = layers.data(
name='label', shape=[1], dtype='int32', main_program=program)
conv_pool_1 = nets.simple_img_conv_pool( conv_pool_1 = nets.simple_img_conv_pool(
input=images, input=images,
filter_size=5, filter_size=5,
num_filters=2, num_filters=2,
pool_size=2, pool_size=2,
pool_stride=2, pool_stride=2,
act="relu", act="relu")
main_program=program)
conv_pool_2 = nets.simple_img_conv_pool( conv_pool_2 = nets.simple_img_conv_pool(
input=conv_pool_1, input=conv_pool_1,
filter_size=5, filter_size=5,
num_filters=4, num_filters=4,
pool_size=2, pool_size=2,
pool_stride=2, pool_stride=2,
act="relu", act="relu")
main_program=program)
predict = layers.fc(input=conv_pool_2, predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
size=10, cost = layers.cross_entropy(input=predict, label=label)
act="softmax", avg_cost = layers.mean(x=cost)
main_program=program)
cost = layers.cross_entropy(
input=predict, label=label, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
program.append_backward(avg_cost) program.append_backward(avg_cost)
print str(program) print(str(program))
def test_word_embedding(self): def test_word_embedding(self):
program = Program() program = Program()
with program_guard(program, startup_program=Program()):
dict_size = 10000 dict_size = 10000
embed_size = 32 embed_size = 32
first_word = layers.data( first_word = layers.data(name='firstw', shape=[1], dtype='int64')
name='firstw', shape=[1], dtype='int64', main_program=program) second_word = layers.data(name='secondw', shape=[1], dtype='int64')
second_word = layers.data( third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
name='secondw', shape=[1], dtype='int64', main_program=program) forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
third_word = layers.data( next_word = layers.data(name='nextw', shape=[1], dtype='int64')
name='thirdw', shape=[1], dtype='int64', main_program=program)
forth_word = layers.data(
name='forthw', shape=[1], dtype='int64', main_program=program)
next_word = layers.data(
name='nextw', shape=[1], dtype='int64', main_program=program)
embed_first = layers.embedding( embed_first = layers.embedding(
input=first_word, input=first_word,
size=[dict_size, embed_size], size=[dict_size, embed_size],
dtype='float32', dtype='float32',
param_attr={'name': 'shared_w'}, param_attr='shared_w')
main_program=program)
embed_second = layers.embedding( embed_second = layers.embedding(
input=second_word, input=second_word,
size=[dict_size, embed_size], size=[dict_size, embed_size],
dtype='float32', dtype='float32',
param_attr={'name': 'shared_w'}, param_attr='shared_w')
main_program=program)
embed_third = layers.embedding( embed_third = layers.embedding(
input=third_word, input=third_word,
size=[dict_size, embed_size], size=[dict_size, embed_size],
dtype='float32', dtype='float32',
param_attr={'name': 'shared_w'}, param_attr='shared_w')
main_program=program)
embed_forth = layers.embedding( embed_forth = layers.embedding(
input=forth_word, input=forth_word,
size=[dict_size, embed_size], size=[dict_size, embed_size],
dtype='float32', dtype='float32',
param_attr={'name': 'shared_w'}, param_attr='shared_w')
main_program=program)
concat_embed = layers.concat( concat_embed = layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], input=[embed_first, embed_second, embed_third, embed_forth],
axis=1, axis=1)
main_program=program)
hidden1 = layers.fc(input=concat_embed, hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
size=256,
act='sigmoid',
main_program=program)
predict_word = layers.fc(input=hidden1, predict_word = layers.fc(input=hidden1,
size=dict_size, size=dict_size,
act='softmax', act='softmax')
main_program=program) cost = layers.cross_entropy(input=predict_word, label=next_word)
cost = layers.cross_entropy( avg_cost = layers.mean(x=cost)
input=predict_word, label=next_word, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost) self.assertIsNotNone(avg_cost)
print str(program) print(str(program))
def test_linear_chain_crf(self): def test_linear_chain_crf(self):
program = Program() program = Program()
with program_guard(program, startup_program=Program()):
# Change g_program, so the rest layers use `g_program` images = layers.data(name='pixel', shape=[784], dtype='float32')
images = layers.data( label = layers.data(name='label', shape=[1], dtype='int32')
name='pixel', shape=[784], dtype='float32', main_program=program) hidden = layers.fc(input=images, size=128)
label = layers.data( crf = layers.linear_chain_crf(input=hidden, label=label)
name='label', shape=[1], dtype='int32', main_program=program) self.assertNotEqual(crf, None)
hidden = layers.fc(input=images, size=128, main_program=program)
crf = layers.linear_chain_crf( print(str(program))
input=hidden, label=label, main_program=program)
print str(program)
if __name__ == '__main__': if __name__ == '__main__':
......
from paddle.v2.fluid.layers import lod_rank_table, data from paddle.v2.fluid.layers import lod_rank_table, data
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import g_main_program
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import numpy import numpy
import unittest import unittest
...@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase): ...@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase):
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(numpy.random.random(size=(17, 100)), cpu) tensor.set(numpy.random.random(size=(17, 100)), cpu)
tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]]) tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
exe.run(g_main_program, scope=scope, feed={'x': tensor}) exe.run(scope=scope, feed={'x': tensor})
var = scope.find_var(rank_table.name) var = scope.find_var(rank_table.name)
table = var.get_lod_rank_table() table = var.get_lod_rank_table()
self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items()) self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
......
import unittest
import numpy as np
from op_test import OpTest
class TestLogLossOp(OpTest):
def setUp(self):
self.op_type = 'log_loss'
samples_num = 32
predicted = np.random.uniform(0.1, 1.0,
(samples_num, 1)).astype("float32")
labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32")
epsilon = 1e-4
self.inputs = {
'Predicted': predicted,
'Labels': labels,
}
self.attrs = {'epsilon': epsilon}
loss = -labels * np.log(predicted + epsilon) - (
1 - labels) * np.log(1 - predicted + epsilon)
self.outputs = {'Loss': loss}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
if __name__ == '__main__':
unittest.main()
...@@ -30,9 +30,7 @@ class TestMaxOutOp(OpTest): ...@@ -30,9 +30,7 @@ class TestMaxOutOp(OpTest):
def init_test_case(self): def init_test_case(self):
self.MaxOut_forward_naive = maxout_forward_naive self.MaxOut_forward_naive = maxout_forward_naive
self.shape = [100, 6, 2, 2] self.shape = [100, 6, 2, 2]
self.groups=2 self.groups = 2
if __name__ == '__main__': if __name__ == '__main__':
......
import unittest import unittest
from paddle.v2.fluid.framework import Variable, Program, g_main_program
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import Program, default_startup_program
main_program = default_startup_program()
class TestOperator(unittest.TestCase): class TestOperator(unittest.TestCase):
def test_error_type(self): def test_error_type(self):
block = g_main_program.create_block() block = main_program.create_block()
try: try:
block.append_op() block.append_op()
self.assertFail() self.assertFail()
......
import unittest import unittest
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.framework import default_main_program
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.io as io import paddle.v2.fluid.io as io
from paddle.v2.fluid.initializer import ConstantInitializer from paddle.v2.fluid.initializer import ConstantInitializer
import numpy as np import numpy as np
main_program = default_main_program()
class TestParameter(unittest.TestCase): class TestParameter(unittest.TestCase):
def test_param(self): def test_param(self):
shape = [784, 100] shape = [784, 100]
val = 1.0625 val = 1.0625
b = g_main_program.global_block() b = main_program.global_block()
param = b.create_parameter( param = b.create_parameter(
name='fc.w', name='fc.w',
shape=shape, shape=shape,
...@@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase): ...@@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase):
self.assertEqual(core.DataType.FP32, param.dtype) self.assertEqual(core.DataType.FP32, param.dtype)
self.assertEqual(0, param.block.idx) self.assertEqual(0, param.block.idx)
exe = Executor(core.CPUPlace()) exe = Executor(core.CPUPlace())
p = exe.run(g_main_program, fetch_list=[param])[0] p = exe.run(main_program, fetch_list=[param])[0]
self.assertTrue(np.allclose(p, np.ones(shape) * val)) self.assertTrue(np.allclose(p, np.ones(shape) * val))
p = io.get_parameter_value_by_name('fc.w', exe, g_main_program) p = io.get_parameter_value_by_name('fc.w', exe, main_program)
self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val)) self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
......
import unittest
import numpy as np
import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler
import paddle.v2.fluid.layers as layers
class TestProfiler(unittest.TestCase):
def test_nvprof(self):
if not fluid.core.is_compile_gpu():
return
epoc = 8
dshape = [4, 3, 28, 28]
data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.GPUPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
for i in range(epoc):
input = np.random.random(dshape).astype('float32')
exe.run(fluid.default_main_program(), feed={'data': input})
if __name__ == '__main__':
unittest.main()
from __future__ import print_function
import unittest import unittest
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program, default_main_program
from paddle.v2.fluid.framework import g_main_program import paddle.v2.fluid.layers as layers
main_program = default_main_program()
class TestProgram(unittest.TestCase): class TestProgram(unittest.TestCase):
def test_program(self): def test_program(self):
b = g_main_program.current_block() b = main_program.current_block()
self.assertEqual(-1, b.parent_idx) self.assertEqual(-1, b.parent_idx)
self.assertEqual(0, b.idx) self.assertEqual(0, b.idx)
b = g_main_program.create_block() b = main_program.create_block()
self.assertEqual(1, b.idx) self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx) self.assertEqual(0, b.parent_idx)
b = g_main_program.create_block() b = main_program.create_block()
self.assertEqual(2, b.idx) self.assertEqual(2, b.idx)
self.assertEqual(1, b.parent_idx) self.assertEqual(1, b.parent_idx)
g_main_program.rollback() main_program.rollback()
b = g_main_program.current_block() b = main_program.current_block()
self.assertEqual(1, b.idx) self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx) self.assertEqual(0, b.parent_idx)
b = g_main_program.create_block() b = main_program.create_block()
self.assertEqual(3, b.idx) self.assertEqual(3, b.idx)
self.assertEqual(1, b.parent_idx) self.assertEqual(1, b.parent_idx)
g_main_program.rollback() main_program.rollback()
b = g_main_program.current_block() b = main_program.current_block()
self.assertEqual(1, b.idx) self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx) self.assertEqual(0, b.parent_idx)
...@@ -48,8 +51,8 @@ class TestProgram(unittest.TestCase): ...@@ -48,8 +51,8 @@ class TestProgram(unittest.TestCase):
# FIXME(yuyang18): We manual compare the output string, since the order # FIXME(yuyang18): We manual compare the output string, since the order
# of variable could be changed. # of variable could be changed.
print prog print(prog)
print prog.clone() print(prog.clone())
def test_parse_program_from_string(self): def test_parse_program_from_string(self):
prog = Program() prog = Program()
...@@ -67,8 +70,8 @@ class TestProgram(unittest.TestCase): ...@@ -67,8 +70,8 @@ class TestProgram(unittest.TestCase):
binary_str = prog.desc.serialize_to_string() binary_str = prog.desc.serialize_to_string()
prog_restored = Program.parse_from_string(binary_str) prog_restored = Program.parse_from_string(binary_str)
print prog print(prog)
print prog_restored print(prog_restored)
def test_append_backward(self): def test_append_backward(self):
prog = Program() prog = Program()
...@@ -123,6 +126,20 @@ class TestProgram(unittest.TestCase): ...@@ -123,6 +126,20 @@ class TestProgram(unittest.TestCase):
actual_ops.append(op.type) actual_ops.append(op.type)
self.assertEqual(actual_ops, expect_ops) self.assertEqual(actual_ops, expect_ops)
def test_program_clone_with_parameter(self):
main_program = Program()
startup_program = Program()
kwargs = {
'main_program': main_program,
'startup_program': startup_program
}
d = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
hidden = layers.fc(input=d, size=100, **kwargs)
layers.fc(input=hidden, size=100, **kwargs)
new_program = main_program.clone()
self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -271,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1): ...@@ -271,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1):
temp_l = layers.fc(input=x_t, temp_l = layers.fc(input=x_t,
size=self.input_dim, size=self.input_dim,
param_attr={'name': 'W'}, param_attr='W',
bias_attr=False, bias_attr=False,
**self.p_info) **self.p_info)
temp_r = layers.fc(input=h_pre, temp_r = layers.fc(input=h_pre,
size=self.input_dim, size=self.input_dim,
param_attr={'name': 'U'}, param_attr='U',
bias_attr=False, bias_attr=False,
**self.p_info) **self.p_info)
......
...@@ -4,24 +4,22 @@ import math ...@@ -4,24 +4,22 @@ import math
import sys import sys
from op_test import OpTest from op_test import OpTest
class TestROIPoolOp(OpTest): class TestROIPoolOp(OpTest):
def set_data(self): def set_data(self):
self.init_test_case() self.init_test_case()
self.make_rois() self.make_rois()
self.calc_roi_pool() self.calc_roi_pool()
self.inputs = { self.inputs = {'X': self.x, 'ROIs': self.rois}
'X': self.x,
'ROIs': self.rois}
self.attrs = { self.attrs = {
'spatial_scale': self.spatial_scale, 'spatial_scale': self.spatial_scale,
'pooled_height': self.pooled_height, 'pooled_height': self.pooled_height,
'pooled_width': self.pooled_width} 'pooled_width': self.pooled_width
}
self.outputs = { self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
'Out': self.outs,
'Argmax': self.argmaxes}
def init_test_case(self): def init_test_case(self):
self.batch_size = 5 self.batch_size = 5
...@@ -30,10 +28,9 @@ class TestROIPoolOp(OpTest): ...@@ -30,10 +28,9 @@ class TestROIPoolOp(OpTest):
self.width = 4 self.width = 4
# n, c, h, w # n, c, h, w
self.x_dim = (self.batch_size, self.channels, self.x_dim = (self.batch_size, self.channels, self.height, self.width)
self.height, self.width)
self.spatial_scale = 1.0/4.0 self.spatial_scale = 1.0 / 4.0
self.pooled_height = 2 self.pooled_height = 2
self.pooled_width = 2 self.pooled_width = 2
self.rois_num = 2 self.rois_num = 2
...@@ -41,11 +38,9 @@ class TestROIPoolOp(OpTest): ...@@ -41,11 +38,9 @@ class TestROIPoolOp(OpTest):
self.x = np.random.random(self.x_dim).astype('float32') self.x = np.random.random(self.x_dim).astype('float32')
def calc_roi_pool(self): def calc_roi_pool(self):
out_data = np.zeros( out_data = np.zeros((self.rois_num, self.channels, self.pooled_height,
(self.rois_num, self.channels, self.pooled_width))
self.pooled_height, self.pooled_width)) argmax_data = np.zeros((self.rois_num, self.channels,
argmax_data = np.zeros(
(self.rois_num, self.channels,
self.pooled_height, self.pooled_width)) self.pooled_height, self.pooled_width))
for i in range(self.rois_num): for i in range(self.rois_num):
...@@ -56,8 +51,8 @@ class TestROIPoolOp(OpTest): ...@@ -56,8 +51,8 @@ class TestROIPoolOp(OpTest):
roi_end_w = int(round(roi[3] * self.spatial_scale)) roi_end_w = int(round(roi[3] * self.spatial_scale))
roi_end_h = int(round(roi[4] * self.spatial_scale)) roi_end_h = int(round(roi[4] * self.spatial_scale))
roi_height = int(max(roi_end_h - roi_start_h + 1, 1)); roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
roi_width = int(max(roi_end_w - roi_start_w + 1, 1)); roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
x_i = self.x[roi_batch_id] x_i = self.x[roi_batch_id]
...@@ -104,10 +99,10 @@ class TestROIPoolOp(OpTest): ...@@ -104,10 +99,10 @@ class TestROIPoolOp(OpTest):
y1 = np.random.random_integers( y1 = np.random.random_integers(
0, self.height / self.spatial_scale - self.pooled_height) 0, self.height / self.spatial_scale - self.pooled_height)
x2 = np.random.random_integers( x2 = np.random.random_integers(x1 + self.pooled_width,
x1 + self.pooled_width, self.width / self.spatial_scale) self.width / self.spatial_scale)
y2 = np.random.random_integers( y2 = np.random.random_integers(y1 + self.pooled_height,
y1 + self.pooled_height, self.height / self.spatial_scale) self.height / self.spatial_scale)
roi = [batch_ids[i], x1, y1, x2, y2] roi = [batch_ids[i], x1, y1, x2, y2]
rois.append(roi) rois.append(roi)
...@@ -123,5 +118,6 @@ class TestROIPoolOp(OpTest): ...@@ -123,5 +118,6 @@ class TestROIPoolOp(OpTest):
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core ...@@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward_ops
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
main_program = default_main_program()
class TestShrinkRNNMemory(unittest.TestCase): class TestShrinkRNNMemory(unittest.TestCase):
def test_shrink_rnn_memory(self): def test_shrink_rnn_memory(self):
...@@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase): ...@@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
append_backward_ops(loss=mem3_mean) append_backward_ops(loss=mem3_mean)
x_grad = exe.run( x_grad = exe.run(
feed={'x': tensor}, feed={'x': tensor},
fetch_list=[g_main_program.global_block().var('x@GRAD')])[0] fetch_list=[main_program.global_block().var('x@GRAD')])[0]
self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1) self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)
......
import unittest
import numpy as np
from op_test import OpTest
def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
s0, s1, s2, s3 = input.shape
out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
out = np.zeros((s0, s1, out_hsize, out_wsize))
for nidx in xrange(s0):
for cidx in xrange(s1):
for h in xrange(s2):
for w in xrange(s3):
index = indices[nidx, cidx, h, w]
hidx = (index - index % out_wsize) / out_wsize
widx = index % out_wsize
out[nidx, cidx, int(hidx), int(widx)] = \
input[nidx, cidx, h, w]
return out
class TestUnpoolOp(OpTest):
def setUp(self):
self.op_type = "unpool"
self.init_test_case()
pre_input = np.random.random(self.shape).astype("float32")
nsize, csize, hsize, wsize = pre_input.shape
hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \
self.strides[0] + 1
wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \
self.strides[1] + 1
input = np.zeros((nsize, csize, hsize_out, wsize_out))
indices = np.zeros((nsize, csize, hsize_out, wsize_out))
for i in xrange(hsize_out):
for j in xrange(wsize_out):
r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
r_end = np.min((i * self.strides[0] + self.ksize[0] - \
self.paddings[0], hsize))
c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
c_end = np.min((j * self.strides[1] + self.ksize[1] - \
self.paddings[1], wsize))
for nidx in xrange(nsize):
for cidx in xrange(csize):
x_masked = pre_input[nidx, cidx, r_start:r_end, \
c_start:c_end]
input[nidx, cidx, i, j] = x_masked.max()
arg = x_masked.argmax()
indices[nidx, cidx, i, j] = \
(r_start + arg / self.ksize[1]) * wsize + \
c_start + arg % self.ksize[1]
output = self.unpool2d_forward_naive(input, indices, self.ksize, \
self.strides, self.paddings).astype("float32")
self.inputs = {
'X': input.astype('float32'),
'Indices': indices.astype('int32')
}
self.attrs = {
'strides': self.strides,
'paddings': self.paddings,
'ksize': self.ksize,
'unpooling_type': self.unpooling_type,
}
self.outputs = {'Out': output.astype('float32')}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
def init_test_case(self):
self.unpool2d_forward_naive = unpool2dmax_forward_naive
self.unpooling_type = "max"
self.shape = [6, 4, 5, 5]
self.ksize = [3, 3]
self.strides = [2, 2]
self.paddings = [0, 0]
if __name__ == '__main__':
unittest.main()
import unittest import unittest
from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_ from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import numpy as np import numpy as np
...@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase): ...@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
self.assertRaises(ValueError, lambda: convert("int8")) self.assertRaises(ValueError, lambda: convert("int8"))
def test_var(self): def test_var(self):
b = g_main_program.current_block() b = default_main_program().current_block()
w = b.create_var( w = b.create_var(
dtype="float64", shape=[784, 100], lod_level=0, name="fc.w") dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
self.assertNotEqual(str(w), "") self.assertNotEqual(str(w), "")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册