diff --git a/.clang-format b/.clang-format
index 9ba433b17362424973626470d930356c2173dd84..aff93435f58c522f5ed1090aef2005f76e91cf31 100644
--- a/.clang-format
+++ b/.clang-format
@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
-
diff --git a/.travis.yml b/.travis.yml
index c51e02eb79a9e53a2b8d1d663e8f0c3e0d8c3a61..e2d49daa1981396628efa5d16459eb70e9e76884 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ before_install:
script:
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
- RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+ RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65164b8472b902be8b0b9d5fb99807d012b8a666..e76512166fcaea5daf2a67d1259331b680f15b7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,6 +133,8 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
+include(external/cares)
+include(external/grpc)
include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration
diff --git a/Dockerfile b/Dockerfile
index 150344a8116e2be9b5bab8e5fdcc9c37f4025020..857d3f3e5f64791146741ffb29feabfcb2ecbb84 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN apt-get update && \
automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
- net-tools && \
+ net-tools libtool && \
apt-get clean -y
# Install Go and glide
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e05111ee18efc906e39bcb56fb1be3b3c3dff5d6
--- /dev/null
+++ b/cmake/external/cares.cmake
@@ -0,0 +1,45 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+ return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: c-ares is needed when linking with grpc.
+
+SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
+SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
+SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
+
+ExternalProject_Add(
+ extern_cares
+ GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
+ GIT_TAG "cares-1_13_0"
+ PREFIX ${CARES_SOURCES_DIR}
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
+ BUILD_IN_SOURCE 1
+ BUILD_COMMAND make
+ INSTALL_COMMAND make install
+)
+
+ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
+ "${CARES_INSTALL_DIR}/lib/libcares.a")
+
+include_directories(${CARES_INCLUDE_DIR})
+ADD_DEPENDENCIES(cares extern_cares)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..219ea1b90881ccdbaf3fd41510fb4f2a8b6ec0f4
--- /dev/null
+++ b/cmake/external/grpc.cmake
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+ return()
+ENDIF()
+
+include (ExternalProject)
+
+SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
+SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
+SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
+SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+IF(APPLE)
+ SET(BUILD_CMD make -n | sed "s/-Werror//g" | sh)
+ELSE()
+ SET(BUILD_CMD make)
+ENDIF()
+
+ExternalProject_Add(
+ extern_grpc
+ DEPENDS protobuf zlib
+ GIT_REPOSITORY "https://github.com/grpc/grpc.git"
+ GIT_TAG "v1.7.x"
+ PREFIX ${GRPC_SOURCES_DIR}
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_IN_SOURCE 1
+ # NOTE(yuyang18):
+ # Disable -Werror, otherwise the build will fail on macOS.
+ # It seems that we cannot configure that through the make command line.
+ # So we dry-run make, strip `-Werror` from its output, and then run the resulting commands through a shell.
+ BUILD_COMMAND ${BUILD_CMD} HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin
+ INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
+)
+
+# FIXME(typhoonzero): hack to get the static lib paths; try a better way, e.g. merging them into one library.
+ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
+ "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
+
+ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
+ "${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
+ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
+ "${GRPC_INSTALL_DIR}/lib/libgpr.a")
+
+ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
+ "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
+
+include_directories(${GRPC_INCLUDE_DIR})
+ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index be7f6a9465970711170bd15dcecaadeaa8a55f86..7cfe1e68078eed023fd0cc6971c573bb0108b4cc 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -15,7 +15,18 @@
INCLUDE(ExternalProject)
# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
FIND_PACKAGE(Protobuf QUIET)
-SET(PROTOBUF_FOUND "OFF")
+macro(UNSET_VAR VAR_NAME)
+ UNSET(${VAR_NAME} CACHE)
+ UNSET(${VAR_NAME})
+endmacro()
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(PROTOBUF_FOUND)
+UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
+UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
+UNSET_VAR(PROTOBUF_LITE_LIBRARY)
+UNSET_VAR(PROTOBUF_LIBRARY)
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined.
function(protobuf_generate_python SRCS)
@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
# FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
# make `protobuf_generate_cpp` happy.
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
-
FOREACH(dep ${protobuf_DEPS})
ADD_DEPENDENCIES(protobuf ${dep})
ADD_DEPENDENCIES(protobuf_lite ${dep})
@@ -128,11 +138,11 @@ endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
- find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
- find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
- find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
- find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
- find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+ find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
+ find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
SET_PROTOBUF_VERSION()
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index a98e069b7cd1654ddd5868560d0905eab6d9c692..1638cd8fdfc34575132462859e056a1907f0b2f1 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -50,6 +50,8 @@ ExternalProject_Add(
)
LIST(APPEND external_project_dependencies zlib)
+ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 7b82d409a3b64a5fc8fdfe526a2e82a4e1c9fa8e..c917ca0ff4e087b7caae8876da127bec6b39b798 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -467,3 +467,50 @@ function(py_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction()
+
+# grpc_library generates gRPC code with grpc_cpp_plugin and protoc, and then
+# builds the generated protobuf and gRPC code together with your
+# implementation sources. Use the SRCS argument for your implementation
+# source files and the PROTO argument for your .proto file.
+#
+# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
+
+function(grpc_library TARGET_NAME)
+ set(oneValueArgs PROTO)
+ set(multiValueArgs SRCS DEPS)
+ set(options "")
+ cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+ message(STATUS "generating grpc ${grpc_library_PROTO}")
+
+ get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
+ get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
+ get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
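+ # Generate <name>.pb.{cc,h} from the .proto with protoc first, then generate
+ # the gRPC service stubs <name>.grpc.pb.{cc,h} with the grpc_cpp_plugin via
+ # the custom command below; each set is wrapped in its own cc_library.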
+ protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+ set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
+ set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
+ cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
+
+ add_custom_command(
+ OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
+ COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+ ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+ --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+ DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
+
+ # FIXME(typhoonzero): grpc-generated code does not declare virtual dtors, so
+ # report that as a compiler warning instead of an error. Should also try to remove the warnings.
+ set_source_files_properties(
+ ${grpc_grpc_srcs}
+ PROPERTIES
+ COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+ cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
+
+ set_source_files_properties(
+ ${grpc_library_SRCS}
+ PROPERTIES
+ COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+ cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
+endfunction()
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4d182f6692e09b3e40f3620b77d9a0f20ec5af3..c3f9c18d0663a7a24880b441981875c1e4f015aa 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -54,7 +54,7 @@ img_conv
.. _api_v2.layer_context_projection:
-context_projection
+context_projection
------------------
.. autoclass:: paddle.v2.layer.context_projection
:noindex:
@@ -70,7 +70,7 @@ Image Pooling Layer
img_pool
--------
.. autoclass:: paddle.v2.layer.img_pool
- :noindex:
+ :noindex:
spp
---
@@ -104,7 +104,7 @@ sum_to_one_norm
---------------
.. autoclass:: paddle.v2.layer.sum_to_one_norm
:noindex:
-
+
cross_channel_norm
------------------
.. autoclass:: paddle.v2.layer.cross_channel_norm
@@ -114,7 +114,7 @@ row_l2_norm
-----------
.. autoclass:: paddle.v2.layer.row_l2_norm
:noindex:
-
+
Recurrent Layers
================
@@ -415,6 +415,13 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex
:noindex:
+Factorization Machine Layer
+============================
+
+factorization_machine
+---------------------
+.. autoclass:: paddle.v2.layer.factorization_machine
+ :noindex:
Slicing and Joining Layers
==========================
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
index ac7e98ccf1aadbb973a4801fde842375cf63448c..2b4f921ae93c3b443ed62a28b1fa9fbda14f73ab 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -2,106 +2,70 @@
## Abstract
-PaddlePaddle v0.10.0 uses the "trainer-parameter server"
-architecture. We run multiple replicated instances of trainers (runs
-the same code written by the user) and parameter servers for
-distributed training. This architecture served us well, but has some
-limitations:
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture has served us well, but it has a few limitations:
-1. Need to write special code to handle tasks which should only be run
- by a single trainer. E.g., initializing model and saving model.
+1. There is a need to write special code to handle tasks which should only be run on a single trainer, e.g., initializing the model and saving the model.
-2. Model parallelism is hard: need to write if-else branches conditioned
- on the trainer ID to partition model onto each trainer, and manually
- write the inter-model-shard communication code.
+2. Model parallelism is hard: it requires writing if-else branches conditioned on the trainer ID to partition the model across the trainers, and manually writing the inter-model-shard communication code between them.
-3. The user can not directly specify the parameter update rule: need
- to modify the parameter server C++ code and compile a new
- binary. This adds complication for researchers: A lot of extra
- effort is required. Besides, the training job submission program
- may not allow running arbitrary binaries.
+3. The user cannot directly specify the parameter update rule: this requires modifying the parameter server code and compiling a new binary, which makes things more complicated for researchers since a lot of extra effort is required. Besides, the training job submission program may not allow running arbitrary binaries.
-This design doc discusses PaddlePaddle's new distributed training
-architecture that addresses the above limitations.
+This design doc discusses PaddlePaddle's new distributed training architecture, which addresses the above-mentioned limitations.
## Analysis
-We will assume the user writes the trainer program by Python, the same
-analysis holds if the trainer program is written in C++.
+The assumption is that the user writes the trainer program in either Python or C++.
### Limitation 1
-If we look at the Python code that the user writes, there are two
-kinds of functionalities:
+There are two basic functionalities in the trainer program:
-- The training logic such as load / save model and print log.
-- The neural network definition such as the definition of the data
- layer, the fully connected layer, the cost function and the
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
optimizer.
-When we training with PaddlePaddle v0.10.0 distributedly, multiple
-replicated Python instances are running on different nodes: both the
-training logic and the neural network computation is replicated.
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the
+training logic and the neural network computation logic are replicated.
-The tasks that should only run once all belong to the training logic,
-if we only replicate the neural network computation, but do **not**
-replicate the training logic, the limitation could be solved.
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
### Limitation 2
-Model parallelism means running a single model on multiple nodes by
-partitioning the model onto different nodes and managing the
-inter-model-shard communications.
+Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
-PaddlePaddle should be able to modify the nerual network computation
-definition to support model parallelism automatically. However, the
-computation is only specified in Python code, and PaddlePaddle can not
-modify Python code.
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code, which sits outside of PaddlePaddle; hence PaddlePaddle cannot support this feature in such a setup.
-Just like compiler uses a intermediate representation (IR) so that
-programmer does not need to manually optimize their code in most of
-the cases - the compiler will optimize the IR:
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code in most cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
-We can have our own IR too: PaddlePaddle can support model parallel by
-converting the IR so the user no longer need to manually do it in
-Python:
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
-The IR for PaddlePaddle after refactor is called `Block`, it specifies
-the computation dependency graph and the variables used in the
-computation.
+The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.
### Limitation 3
-The user can not directly specify the parameter update rule for the
-parameter server because the parameter server does not use the same
-computation definition as the trainer. Instead, the update rule is
-baked in the parameter server. The user can not specify the update
-rule in the same way of specifying the trainer computation.
+The user cannot directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked into the parameter server, and the user cannot specify it explicitly.
-This could be fixed by making the parameter server run the same
-computation definition as the trainer. For a detailed explanation,
-please
-see
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document:
[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
## Distributed Training Architecture
-The new distributed training architecture can address the above
-limitations. Below is the illustration:
+The revamped distributed training architecture can address the limitations discussed above. Below is an illustration of how it does so:
-The architecture includes major components: *PaddlePaddle Python*,
-*PaddlePaddle converter* and *PaddlePaddle runtime*:
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
### PaddlePaddle Python
-PaddlePaddle Python is the Python library that user's Python trainer
-invoke to build the neural network topology, start training, etc.
+PaddlePaddle Python is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.
```Python
paddle.init()
@@ -117,102 +81,60 @@ for i in range(1000):
print cost_val
```
-The code above is a typical Python trainer code, the neural network
-topology is built using helper functions such as
-`paddle.layer.fc`. The training is done by calling `session.eval`
-iteratively.
+The above code is a typical Python trainer program: the neural network topology is built using helper functions such as `paddle.layer.fc`, and training is done by calling `session.eval` iteratively.
#### session.eval
-As shown in the graph, `session.eval` sends the IR and the evaluation
-inputs/targets to the PaddlePaddle cluster for evaluation. The
-targets can be any variable in the computation graph. When the target
-is the `optimizer` variable, the neural network will be optimized
-once. When the target is the `cost` variable, `session.eval` returns
-the cost value.
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs and targets to the PaddlePaddle cluster for evaluation.
+The targets can be any variable in the computation graph. When the target is, say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
-The Python `session` is a wrapper of the C++ `Session` class. For more
-information about `Session`, please
-see [Design Doc: Session](./session.md).
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document: [Design Doc: Session](./session.md).
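+
+A minimal sketch of evaluating the two kinds of targets is shown below. The exact `session.eval` argument names are an assumption made for illustration; `optimizer` and `cost` are the variables built by the trainer code above:
+
+```Python
+# assumed API, for illustration only: evaluating the optimizer target
+# runs one optimization step and updates the parameters
+session.eval(targets=[optimizer])
+
+# evaluating the cost target only returns the current cost value
+cost_val = session.eval(targets=[cost])
+```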
### PaddlePaddle Converter
-PaddlePaddle converter automatically converts the IR in the request
-(IR and evaluation inputs/targets) from PaddlePaddle Python to new
-partitioned IRs and dispatch the new IRs and evaluation inputs/targets
-to different PaddlePaddle runtimes. Below are the steps:
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed (a pseudocode sketch of the whole pipeline follows the list):
-1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
- fetches the eval targets to the IR.
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
-1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
- the boundary. The runtime does not need to run the OP that is not
- dependent by the `fetch` OP.
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OPs that the `fetch` OP does not depend on.
-1. Optimizes the computation graph.
+3. Optimize the computation graph.
-1. Place the OPs in the graph onto different devices on different
- PaddlePaddle runtime according to a placement algorithm and device
- constraint specified by the user.
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtimes according to a placement algorithm and the device constraints specified by the user.
-1. Partition the graph according to runtime boundaries and add `send` /
- `recv` OP pair on the runtime boundaries.
+5. Partition the graph according to runtime boundaries and add a `send`/`recv` OP pair at each runtime boundary.
-1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+7. The PaddlePaddle runtimes with the `fetch` OP report the evaluation results back to the converter, and the converter reports the evaluation results back to PaddlePaddle Python.
-1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
- results back to the converter, the convert reports the evaluation
- results back to the PaddlePaddle Python.
-
The output IRs will be cached to optimize the conversion latency.
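+
+The conversion pipeline described in the steps above can be summarized with the following pseudocode sketch (Python-style; the function names are illustrative only and not an actual PaddlePaddle API):
+
+```Python
+def convert(ir, inputs, targets, runtimes, placement_spec):
+    graph = add_feed_fetch_ops(ir, inputs, targets)    # step 1
+    graph = extract_subgraph(graph)                    # step 2: drop OPs the fetch OP does not depend on
+    graph = optimize(graph)                            # step 3
+    placed = place_ops(graph, placement_spec)          # step 4: trainer / parameter-server roles
+    parts = partition_with_send_recv(placed)           # step 5
+    for runtime, part in zip(runtimes, parts):         # step 6
+        runtime.dispatch(part)
+    return collect_fetch_results(runtimes)             # step 7: results flow back via the converter
+```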
#### Placement Algorithm
-Our first implementation will only support "trainer-parameter server"
-placement: the parameters, initializers, and optimizers are placed on
-the PaddlePaddle runtimes with the parameter server role. And
-everything else will be placed on the PaddlePaddle runtimes with the
-trainer role. This has the same functionality of our
-"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
-is more general and flexible.
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
-In the future, we will implement the general placement algorithm,
-which makes placements according to the input IR, and a model of
-device computation time and device communication time. Model
-parallelism requires the general placement algorithm.
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
### PaddlePaddle Runtime
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
-runs the IR. The runtime does not need to do OP placement since it's
-already done by the converter.
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
### Local Training Architecture
-The local training architecture will be the same as the distributed
-training architecture, the differences are everything runs locally,
-and there is just one PaddlePaddle runtime:
+The local training architecture will be the same as the distributed training architecture; the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
### Training Data
-In PaddlePaddle v0.10.0, training data is typically read
-with [data reader](../reader/README.md) from Python. This approach is
-no longer efficient when training distributedly since the Python
-process no longer runs on the same node with the trainer processes,
-the Python reader will need to read from the distributed filesystem
-(assuming it has the access) and send to the trainers, doubling the
-network traffic.
-
-When doing distributed training, the user can still use Python data
-reader: the training data are sent with `session.eval`. However should
-be used for debugging purpose only. The users are encouraged to use
-the read data OPs.
+In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion, since the Python process no longer runs on the same node as the trainer processes. The Python reader would need to read from the distributed filesystem (assuming it has the required access) and send the data to the trainers, doubling the network traffic.
+
+When doing distributed training, the user can still use the Python data reader: the training data are sent with `session.eval`. However, this should be used for debugging purposes only. Users are encouraged to use the read data OPs.
## References:
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
index 55665ac8edfcf20290936fba4c3e410b33e1f3d4..3c525bdad6f6118dcd560e2cb7bfaf89737c1362 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -1,4 +1,4 @@
-从源码编译PaddlePaddle
+从源码编译
======================
.. _build_step:
@@ -7,8 +7,11 @@
----------------
PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译工具。
-我们推荐您使用PaddlePaddle编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境
+我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
可以在 `这里 `_ 找到。
+
+如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+
编译PaddlePaddle,需要执行:
.. code-block:: bash
@@ -22,7 +25,6 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
make
-
编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
@@ -31,7 +33,33 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译
pip install python/dist/*.whl
-.. _build_step:
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法:
+
+使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+ docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+
+如果不使用Docker,可以执行ctest命令即可:
+
+.. code-block:: bash
+
+ mkdir build
+ cd build
+ cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+ make
+ ctest
+ # 指定执行其中一个单元测试 test_mul_op
+ ctest -R test_mul_op
+
+.. _compile_deps:
编译依赖
----------------
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
index 9a3ed7dd57137ddf3d6213222c17433822b01dbb..76fbc43de2e83580dd79b874507c103533022436 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -1,4 +1,4 @@
-Build PaddlePaddle from Sources
+Build from Sources
==========================
.. _build_step:
@@ -9,14 +9,18 @@ How To Build
PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile
tools. We recommend you to use our pre-built Docker image to run the build
to avoid installing dependencies by yourself. We have several build environment
-Docker images `here `_.
+Docker images `here `_ .
+
+If you choose not to use a Docker image for your build, you need to install the
+`Compile Dependencies`_ listed below before running the build.
+
Then run:
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
- # run the following command to build CPU-Only binaries if you are using docker
+ # run the following command to build a CPU-only binary if you are using docker
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
# else run these commands
mkdir build
@@ -32,7 +36,35 @@ machine or copy it to the target machine.
pip install python/dist/*.whl
-.. _build_step:
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the steps below:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the unit tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will also run the unit tests on the GPU.
+
+.. code-block:: bash
+
+ docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+
+If you don't use Docker, simply running ctest will start the tests:
+
+.. code-block:: bash
+
+ mkdir build
+ cd build
+ cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+ make
+ ctest
+ # run a single test like test_mul_op
+ ctest -R test_mul_op
+
+
+.. _compile_deps:
Compile Dependencies
----------------
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 07933b2e0bbca809f6c4e90e7ff8f71d1b3304b2..f78b1fb0e11aa028a4b7abb5270740b97f8039e9 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -1,4 +1,4 @@
-使用Docker安装运行PaddlePaddle
+使用Docker安装运行
================================
使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 9b977c9c72e36b4b47cbf56ae848ab83d9895783..d7acc7aeb744b19d83acb520d07c8551168dd096 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -1,4 +1,4 @@
-PaddlePaddle in Docker Containers
+Run in Docker Containers
=================================
Run PaddlePaddle in Docker container so that you don't need to care about
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
index 41312da48c055826186a560ef9653653e45d1047..b26bf4c95cb18f36408eb75894e8b9b674efc67b 100644
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -1,4 +1,4 @@
-使用pip安装PaddlePaddle
+使用pip安装
================================
PaddlePaddle可以使用常用的Python包管理工具
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
index 4f295e14baa1465a93b8eef1b3f3b6b47eeea905..113790e4e4ca116e91f11f8a233eae874d9d1b7a 100644
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -1,4 +1,4 @@
-Install PaddlePaddle Using pip
+Install Using pip
================================
You can use current widely used Python package management
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 76d3e0a0092f89005605a23e14e712530112a5ac..eb95356c67c5df22e4f543f958eb31d79f2c6195 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,7 +19,6 @@
.. toctree::
:maxdepth: 1
- dev/build_cn.rst
dev/write_docs_cn.rst
模型配置
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 1b6034be4edffd2cbc822018b733b9a3836ea84a..1fbfcd260b912078f00ed5b720ed607db725c4e2 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,7 +18,6 @@ Development
.. toctree::
:maxdepth: 1
- dev/build_en.rst
dev/new_layer_en.rst
dev/contribute_to_paddle_en.md
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index a0f2906c749054c1ff9f624e47df432ec2bd6ac8..fdf6de4babff3bb3c253aaf516636882237e6faf 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -13,6 +13,8 @@
limitations under the License. */
#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
@@ -27,11 +29,11 @@
namespace paddle {
namespace framework {
-std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+std::ostream &operator<<(std::ostream &os, const LoD &lod) {
os << "{";
- for (auto& v : lod) {
+ for (auto &v : lod) {
os << "{";
- for (auto& i : v) {
+ for (auto &i : v) {
os << i << ",";
}
os << "}";
@@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
return os;
}
-LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
+LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
LoD new_lod;
new_lod.reserve(level_end - level_begin);
for (size_t i = level_begin; i < level_end; i++) {
@@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
return new_lod;
}
-LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
size_t elem_end) {
PADDLE_ENFORCE_LT(level, in.size());
PADDLE_ENFORCE_LT(elem_end, in[level].size());
@@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
res[0].assign(in[level].begin() + elem_begin,
in[level].begin() + elem_end + 1);
for (size_t lvl = 1; lvl < res.size(); lvl++) {
- const auto& in_level = in[level + lvl];
- const auto& above_level = res[lvl - 1];
- auto& out_level = res[lvl];
+ const auto &in_level = in[level + lvl];
+ const auto &above_level = res[lvl - 1];
+ auto &out_level = res[lvl];
out_level.assign(in_level.begin() + above_level.front(),
in_level.begin() + above_level.back() + 1);
}
@@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
// to make the first offset equals 0, all the elements minus the first
// element
size_t front = res[lvl].front();
- for (auto& ele : res[lvl]) {
+ for (auto &ele : res[lvl]) {
ele -= front;
}
}
return res;
}
-LoD ToAbsOffset(const LoD& in) {
+LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in;
LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) {
- for (auto& ele : result[level]) {
+ for (auto &ele : result[level]) {
ele = result[level + 1][ele];
}
}
return result;
}
-bool operator==(const LoD& a, const LoD& b) {
+bool operator==(const LoD &a, const LoD &b) {
if (a.size() != b.size()) {
return false;
}
for (size_t i = 0; i < a.size(); i++) {
- const auto& a_level = a[i];
- const auto& b_level = b[i];
+ const auto &a_level = a[i];
+ const auto &b_level = b[i];
if (a_level.size() != b_level.size()) {
return false;
}
@@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
}
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
-LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
size_t end_idx, size_t start_level) {
LoD sub_lod;
@@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
return LoDAndOffset{sub_lod, {start_idx, end_idx}};
}
-void AppendLoD(LoD* lod, const LoD& lod_length) {
+void AppendLoD(LoD *lod, const LoD &lod_length) {
PADDLE_ENFORCE(
lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
@@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) {
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
}
for (size_t i = 0; i < lod->size(); ++i) {
- auto& level = (*lod)[i];
+ auto &level = (*lod)[i];
for (size_t len : lod_length[i]) {
level.push_back(level.back() + len);
}
}
}
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
+ const platform::DeviceContext &dev_ctx) {
+ // TODO(typhoonzero): serialize to ostream
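+ // Stream layout, written in order:
+ //   [uint32_t version][int32_t desc size][TensorDesc protobuf bytes]
+ //   [raw tensor data][uint64_t lod level count][per-level byte size + data]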
+ { // the 1st field, uint32_t version
+ constexpr uint32_t version = 0;
+ os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+ }
+ { // the 2nd field, tensor description
+ // int32_t size
+ // void* protobuf message
+ framework::TensorDesc desc;
+ desc.set_data_type(framework::ToDataType(tensor.type()));
+ auto dims = framework::vectorize(tensor.dims());
+ auto *pb_dims = desc.mutable_dims();
+ pb_dims->Resize(static_cast<int>(dims.size()), 0);
+ std::copy(dims.begin(), dims.end(), pb_dims->begin());
+ int32_t size = desc.ByteSize();
+ os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+ auto out = desc.SerializeAsString();
+ os.write(out.data(), size);
+ }
+ { // the 3rd field, tensor data
+ uint64_t size = tensor.memory_size();
+ auto *data_ptr = tensor.data<void>();
+ PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+ "Index overflow when writing tensor");
+ if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+ constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
+ std::unique_ptr<char[]> buf(new char[kBufSize]);
+ auto &gpu_dev_ctx =
+ static_cast<const platform::CUDADeviceContext &>(dev_ctx);
+ platform::CPUPlace cpu;
+ uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+ while (size != 0) {
+ size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+ memory::Copy(cpu, buf.get(),
+ boost::get<platform::GPUPlace>(tensor.place()),
+ reinterpret_cast<const void *>(data), size_to_write,
+ gpu_dev_ctx.stream());
+ gpu_dev_ctx.Wait();
+ os.write(buf.get(), size_to_write);
+ data += size_to_write;
+ size -= size_to_write;
+ }
+#else
+ PADDLE_THROW("Unexpected branch");
+#endif
+ } else {
+ os.write(static_cast<const char *>(data_ptr),
+ static_cast<std::streamsize>(size));
+ }
+ }
+ { // the 4th field, lod information
+ // uint64_t lod_level
+ // uint64_t lod_level_1 size in byte.
+ // int* lod_level_1 data
+ // ...
+ auto lod = tensor.lod();
+ uint64_t size = lod.size();
+ os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+ for (auto &each : lod) {
+ size = each.size() * sizeof(framework::LoD::value_type::value_type);
+ os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+ os.write(reinterpret_cast<const char *>(each.data()),
+ static_cast<std::streamsize>(size));
+ }
+ }
+}
+
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
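+ // Reads the fields in the same order that SerializeToStream writes them:
+ // version, tensor description, tensor data, then LoD information.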
+ uint32_t version;
+ is.read(reinterpret_cast<char *>(&version), sizeof(version));
+ PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+ framework::TensorDesc desc;
+ { // int32_t size
+ // proto buffer
+ int32_t size;
+ is.read(reinterpret_cast<char *>(&size), sizeof(size));
+ std::unique_ptr<char[]> buf(new char[size]);
+ is.read(reinterpret_cast<char *>(buf.get()), size);
+ PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+ "Cannot parse tensor desc");
+ }
+ { // read tensor
+ std::vector<int64_t> dims;
+ dims.reserve(static_cast<size_t>(desc.dims().size()));
+ std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+ tensor->Resize(framework::make_ddim(dims));
+
+ void *buf;
+ platform::Place cpu = platform::CPUPlace();
+ switch (desc.data_type()) {
+ case framework::FP32:
+ buf = tensor->mutable_data<float>(cpu);
+ break;
+ case framework::FP64:
+ buf = tensor->mutable_data<double>(cpu);
+ break;
+ case framework::INT32:
+ buf = tensor->mutable_data<int>(cpu);
+ break;
+ case framework::INT64:
+ buf = tensor->mutable_data<int64_t>(cpu);
+ break;
+ default:
+ PADDLE_THROW("DataType %d not supported", desc.data_type());
+ }
+ is.read(static_cast<char *>(buf), tensor->memory_size());
+ }
+ { // read lod
+ uint64_t lod_level;
+ is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+ auto &lod = *tensor->mutable_lod();
+ lod.resize(lod_level);
+ for (uint64_t i = 0; i < lod_level; ++i) {
+ uint64_t size;
+ is.read(reinterpret_cast<char *>(&size), sizeof(size));
+ std::vector<size_t> tmp(size / sizeof(size_t));
+ is.read(reinterpret_cast<char *>(tmp.data()),
+ static_cast<std::streamsize>(size));
+ lod[i] = tmp;
+ }
+ }
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 21bdfca1111f16d5b8ea71be004ddb8da12fd03c..9411c96aea4c10ebf921cc3e3b442769c8acbefa 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -189,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
void AppendLoD(LoD* lod, const LoD& lod_length);
+/*
+ * Serialize/Deserialize LoDTensor to/from std::ostream/std::istream.
+ * You can pass an ofstream or ostringstream to serialize to a file
+ * or to an in-memory string. GPU tensors will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
+ const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..be26b9ba88c279036f73b0a0baaff164755fe067
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ /* Initialize the basic parent class */
+ Layer::init(layerMap, parameterMap);
+
+ factorSize_ = config_.factor_size();
+
+ /* initialize the latentVectors_ */
+ CHECK_EQ(inputLayers_.size(), 1UL);
+ size_t inputSize = inputLayers_[0]->getSize();
+ CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+ latentVectors_ = std::unique_ptr<Weight>(
+ new Weight(inputSize, factorSize_, parameters_[0]));
+
+ return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+ Layer::forward(passType);
+
+ const MatrixPtr& inputV = getInputValue(0);
+
+ size_t batchSize = inputV->getHeight();
+ size_t outputSize = getSize();
+ size_t inputSize = inputLayers_[0]->getSize();
+ reserveOutput(batchSize, outputSize);
+
+ MatrixPtr outV = getOutputValue();
+
+ Matrix::resizeOrCreate(
+ latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+ Matrix::resizeOrCreate(
+ inputMulFactor_, batchSize, factorSize_, false, useGpu_);
+ Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+
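+ // Compute the order-2 interactions with the O(k*n) identity:
+ //   y = 0.5 * sum_f [ (sum_i v_{i,f} x_i)^2 - sum_i v_{i,f}^2 x_i^2 ]
+ // The first term below is the square of (input * latentVectors_), summed
+ // over the factor dimensions.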
+ REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
+ inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+ inputMulFactor_->square2(*tmpOut_);
+ outV->sumRows(*tmpOut_, 0.5, 0);
+
+ if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+ Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+ inputV->getHeight(),
+ inputV->getWidth(),
+ inputV->getElementCnt(),
+ inputV->getValueType());
+ inputSquare_->copyFrom(*inputV);
+ (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
+ } else {
+ Matrix::resizeOrCreate(
+ inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+ inputV->square2(*inputSquare_);
+ }
+ latentVectors_->getW()->square2(*latentVectorsSquare_);
+ tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
+ outV->sumRows(*tmpOut_, -0.5, 1.0);
+
+ /* activation */ {
+ REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
+ forwardActivation();
+ }
+}
+
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+ /* Do derivation */ { backwardActivation(); }
+
+ const MatrixPtr& inputV = getInputValue(0);
+ const MatrixPtr& oGrad = getOutputGrad();
+
+ Matrix::resizeOrCreate(
+ tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+ MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+ latentVectors_->getW()->getHeight(),
+ 1,
+ false,
+ useGpu_);
+
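+ // Gradient of the order-2 term w.r.t. the latent vectors (scaled by the
+ // output gradient oGrad):
+ //   dy/dv_{i,f} = x_i * (sum_j v_{j,f} x_j) - v_{i,f} * x_i^2
+ // The two terms are accumulated below for both sparse and dense inputs.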
+ /* Calculate the gradients of the latentVectors_ matrix */
+ if (latentVectors_->getWGrad()) {
+ if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+ Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+ inputV->getHeight(),
+ inputV->getWidth(),
+ inputV->getElementCnt());
+
+ CpuSparseMatrix* sparseInputV =
+ dynamic_cast<CpuSparseMatrix*>(inputV.get());
+ CpuSparseMatrix* sparseInputSquare =
+ dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+ CpuSparseMatrix* sparseTmpInput =
+ dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
+ sparseTmpInput->copyFrom(*sparseInputV);
+
+ sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+ latentVectors_->getWGrad()->mul(
+ *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+ sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+ Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+ negOnes_->zeroMem();
+ negOnes_->add(-1);
+ tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
+ } else {
+ Matrix::resizeOrCreate(
+ tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+
+ tmpInput_->rowScale(0, *inputV, *oGrad);
+ latentVectors_->getWGrad()->mul(
+ *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+ tmpInput_->rowScale(0, *inputSquare_, *oGrad);
+
+ tmpSum_->sumCols(*tmpInput_, -1, 0);
+ }
+
+ latentVectors_->getWGrad()->addRowScale(
+ 0, *latentVectors_->getW(), *tmpSumTrans);
+
+ /* Increasing the number of gradient */
+ latentVectors_->getParameterPtr()->incUpdate(callback);
+ }
+
+ /* Calculate the input layers gradient */
+ MatrixPtr inGrad = getInputGrad(0);
+ if (inGrad != NULL) {
+ inGrad->mul(
+ *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+ tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+ inGrad->addColScale(0, *inputV, *tmpSum_);
+ inGrad->rowScale(0, *inGrad, *oGrad);
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..df20a49934d5dd444f127842c8fdb7c77f4ebeb1
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interations with the formula:
+ *
+ * \f[
+ * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found at this paper:
+ *
+ * Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+protected:
+ // The latent vectors, shape: (size, factorSize_)
+ // Each row of the latentVectors_ matrix is the latent vector
+ // corresponding to one input feature dimension
+ std::unique_ptr<Weight> latentVectors_;
+ // The hyperparameter that defines the dimensionality of the factorization
+ size_t factorSize_;
+
+private:
+ // Store the square values of the latent vectors matrix
+ MatrixPtr latentVectorsSquare_;
+ // Store the square values of input matrix
+ MatrixPtr inputSquare_;
+ // The result of input matrix * latent vector matrix that will be used in
+ // both forward and backward step
+ MatrixPtr inputMulFactor_;
+ // Store temporary calculation result
+ MatrixPtr tmpOut_;
+ MatrixPtr tmpSum_;
+ MatrixPtr tmpInput_;
+ // A row vector with all elements set to -1, used to negate and sum rows via matrix multiplication
+ MatrixPtr negOnes_;
+
+public:
+ explicit FactorizationMachineLayer(const LayerConfig& config)
+ : Layer(config) {}
+ ~FactorizationMachineLayer() {}
+
+ bool init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) override;
+
+ void forward(PassType passType) override;
+ void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index d62a8d846e5b347aa44ce1951c043d5813a5b3ff..236f8096bdb6e024cf3c9c73eba422616a777a23 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
batchSize,
codeLength_,
/* trans */ false,
- useGpu(deviceId_));
+ false);
Matrix::resizeOrCreate(preOutput_.grad,
batchSize,
codeLength_,
/* trans */ false,
- useGpu(deviceId_));
-
+ false);
IVectorPtr label = getInput(*getLabelLayer()).ids;
-
preOutput_.value->zeroMem();
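+ // The bit-code operations below run on CPU matrices, so when useGpu_ is set
+ // the inputs, labels and parameters are staged through CPU buffers and the
+ // results are copied back to the GPU afterwards.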
+ if (useGpu_) {
+ Matrix::resizeOrCreate(cpuOutput_,
+ output_.value->getHeight(),
+ output_.value->getWidth(),
+ /* trans */ false,
+ false);
+ IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+ cpuLabel_->copyFrom(*label);
+ cpuOutput_->copyFrom(*output_.value);
+ } else {
+ cpuOutput_ = output_.value;
+ cpuLabel_ = label;
+ }
/* add the bias-vector */
if (biases_.get() != NULL) {
- preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+ if (useGpu_) {
+ Matrix::resizeOrCreate(cpuBias_,
+ 1,
+ numClasses_ - 1,
+ /* trans */ false,
+ false);
+ cpuBias_->copyFrom(*biases_->getW());
+ } else {
+ cpuBias_ = biases_->getW();
+ }
+ preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
}
for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
MatrixPtr input = getInputValue(i);
+ if (useGpu_) {
+ Matrix::resizeOrCreate(cpuInput_,
+ input->getHeight(),
+ input->getWidth(),
+ /* trans */ false,
+ false);
+ Matrix::resizeOrCreate(cpuWeight_,
+ weights_[i]->getW()->getHeight(),
+ weights_[i]->getW()->getWidth(),
+ /* trans */ false,
+ false);
+ cpuInput_->copyFrom(*input);
+ cpuWeight_->copyFrom(*weights_[i]->getW());
+ } else {
+ cpuInput_ = input;
+ cpuWeight_ = weights_[i]->getW();
+ }
preOutput_.value->mulByBitCode(
- numClasses_, *label, *weights_[i]->getW(), *input);
+ numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
}
// keep consistent with the clipping in the following softrelu
preOutput_.value->clip(-40.0, 40.0);
preOutput_.value->sumByBitCode(numClasses_,
- *label,
- *output_.value,
+ *cpuLabel_,
+ *cpuOutput_,
-1); // scaleSum
preOutput_.value->softrelu(*preOutput_.value);
- MatrixPtr sum =
- Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
+ MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
preOutput_.value->rowSum(*sum);
- output_.value->add(*sum);
+ cpuOutput_->add(*sum);
+ if (useGpu_) {
+ output_.value->copyFrom(*cpuOutput_);
+ } else {
+ output_.value = cpuOutput_;
+ }
}
void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
IVectorPtr label = getInput(*getLabelLayer()).ids;
+ if (useGpu_) {
+ IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+ cpuLabel_->copyFrom(*label);
+ } else {
+ cpuLabel_ = label;
+ }
preOutput_.grad->one();
preOutput_.grad->softreluDerivative(*preOutput_.value);
- preOutput_.grad->subByBitCode(numClasses_, *label);
+ preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
if (biases_ && biases_->getWGrad()) {
- preOutput_.grad->addByBitCodeBackward(
- numClasses_, *label, *biases_->getWGrad());
-
+ MatrixPtr biases_grad = biases_->getWGrad();
+ if (useGpu_) {
+ Matrix::resizeOrCreate(cpuBias_,
+ 1,
+ numClasses_ - 1,
+ /* trans */ false,
+ false);
+ cpuBias_->copyFrom(*biases_grad);
+ } else {
+ cpuBias_ = biases_grad;
+ }
+ preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+ if (useGpu_) {
+ biases_grad->copyFrom(*cpuBias_);
+ } else {
+ biases_grad = cpuBias_;
+ }
/* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback);
}
@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the W-gradient for the current layer */
MatrixPtr input = getInputValue(i);
if (weights_[i]->getWGrad()) {
+ MatrixPtr weights_grad = weights_[i]->getWGrad();
+ if (useGpu_) {
+ Matrix::resizeOrCreate(cpuInput_,
+ input->getHeight(),
+ input->getWidth(),
+ /* trans */ false,
+ false);
+ Matrix::resizeOrCreate(cpuWeightGrad_,
+ weights_grad->getHeight(),
+ weights_grad->getWidth(),
+ /* trans */ false,
+ false);
+ cpuInput_->copyFrom(*input);
+ cpuWeightGrad_->copyFrom(*weights_grad);
+ } else {
+ cpuInput_ = input;
+ cpuWeightGrad_ = weights_grad;
+ }
preOutput_.grad->mulByBitCodeBackwardWeight(
- numClasses_, *label, *weights_[i]->getWGrad(), *input);
-
+ numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+ if (useGpu_) {
+ weights_grad->copyFrom(*cpuWeightGrad_);
+ } else {
+ weights_grad = cpuWeightGrad_;
+ }
/* Increase the gradient update count */
weights_[i]->getParameterPtr()->incUpdate(callback);
}
@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the input layers error */
MatrixPtr inputGrad = getInputGrad(i);
if (inputGrad) {
+ if (useGpu_) {
+ Matrix::resizeOrCreate(cpuInputGrad_,
+ inputGrad->getHeight(),
+ inputGrad->getWidth(),
+ /* trans */ false,
+ false);
+ Matrix::resizeOrCreate(cpuWeight_,
+ weights_[i]->getW()->getHeight(),
+ weights_[i]->getW()->getWidth(),
+ /* trans */ false,
+ false);
+ cpuInputGrad_->copyFrom(*inputGrad);
+ cpuWeight_->copyFrom(*weights_[i]->getW());
+ } else {
+ cpuInputGrad_ = inputGrad;
+ cpuWeight_ = weights_[i]->getW();
+ }
preOutput_.grad->mulByBitCodeBackwardError(
- numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+ numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+ if (useGpu_) {
+ inputGrad->copyFrom(*cpuInputGrad_);
+ } else {
+ inputGrad = cpuInputGrad_;
+ }
}
}
}
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 9afd40b1674680da962d6e51caa56b46279b70de..7f896e61ca26e3e22b99b65b1285384a121f7f02 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,15 @@ protected:
int codeLength_;
/// temporary result of output_
Argument preOutput_;
+
+ /// The temporary variables in CPU memory.
+ MatrixPtr cpuWeight_;
+ MatrixPtr cpuWeightGrad_;
+ MatrixPtr cpuInput_;
+ MatrixPtr cpuInputGrad_;
+ MatrixPtr cpuBias_;
+ MatrixPtr cpuOutput_;
+ IVectorPtr cpuLabel_;
};
} // namespace paddle
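The members added above support the copy-compute-copy pattern used by the new GPU path: the bit-code kernels (addByBitCode, mulByBitCode, ...) only operate on CPU matrices, so GPU tensors are staged through host buffers. A condensed, hedged sketch of that pattern follows; cpuBuf_, src and cpuKernel are placeholder names for illustration, not part of the patch:

    // Stage a (possibly GPU-resident) matrix into a CPU buffer, run the
    // CPU-only bit-code kernel, then copy the result back when it was written.
    if (useGpu_) {
      Matrix::resizeOrCreate(cpuBuf_, src->getHeight(), src->getWidth(),
                             /* trans */ false, /* useGpu */ false);
      cpuBuf_->copyFrom(*src);   // device -> host
    } else {
      cpuBuf_ = src;             // on CPU, share the matrix directly
    }
    cpuKernel(*cpuBuf_);         // e.g. addByBitCode / mulByBitCode
    if (useGpu_) {
      src->copyFrom(*cpuBuf_);   // host -> device, only for outputs/gradients
    }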
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index cacf10692942f5eca2f6c498183f4acc00768460..c5359f272b4bed4d4d2483bf19d7ae482b0d33dd 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
- // Not support GPU now
- testLayerGrad(config,
- "hsigmoid",
- 100,
- /* trans */ false, /* useGpu */
- false);
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config,
+ "hsigmoid",
+ 100,
+ /* trans */ false,
+ /* useGpu */ useGpu);
+ }
}
TEST(Layer, multi_cross) {
@@ -2464,6 +2465,25 @@ TEST(Layer, L2DistanceLayer) {
}
}
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+ const int FACTOR_SIZE = 10;
+ TestConfig config;
+ config.layerConfig.set_type("factorization_machine");
+ config.layerConfig.set_factor_size(FACTOR_SIZE);
+ config.layerConfig.set_size(1);
+ config.biasSize = 0;
+ config.inputDefs.push_back({type, "layer_0", 128, 1280});
+ config.layerConfig.add_inputs();
+ testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
+
+TEST(Layer, FactorizationMachineLayer) {
+ for (auto useGpu : {false, true}) {
+ testFactorizationMachineLayer(INPUT_DATA, useGpu);
+ }
+ testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
+}
+
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index bf62229c03bb1d6e2bdf86d8c56a8157938fb832..dc6979cf5a5229fb09866189f28217889d58c2d0 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
os << ";";
}
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+ CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+ CHECK_EQ(height_, b.getHeight());
+ CHECK_EQ(width_, b.getWidth());
+ real* A = getValue();
+ real* B = b.getValue();
+ if (b.getValueType() == FLOAT_VALUE) {
+ for (size_t i = 0; i < height_; i++) {
+ size_t start = getRowStartIdx(i);
+ size_t end = getRowStartIdx(i + 1);
+ CHECK_EQ(start, b.getRowStartIdx(i));
+ CHECK_EQ(end, b.getRowStartIdx(i + 1));
+ for (size_t j = start; j < end; j++) {
+ A[j] = B[j] * c.getElement(i, cCol);
+ }
+ }
+ } else if (b.getValueType() == NO_VALUE) {
+ for (size_t i = 0; i < height_; i++) {
+ size_t start = getRowStartIdx(i);
+ size_t end = getRowStartIdx(i + 1);
+ CHECK_EQ(start, b.getRowStartIdx(i));
+ CHECK_EQ(end, b.getRowStartIdx(i + 1));
+ for (size_t j = start; j < end; j++) {
+ A[j] = c.getElement(i, cCol);
+ }
+ }
+ }
+}
+
void CpuSparseMatrix::randomizeUniform() {
CHECK_LE(elementCnt_, height_ * width_);
if (valueType_ == FLOAT_VALUE) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index aad1348353d558abca72ed0fa5cf943237e3ac78..522b436a2a69179d3f4f17c919d5ba024102db7b 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -239,6 +239,15 @@ public:
const unsigned int* cols,
const real* values);
+ /**
+ * @brief this_row = b_row * c_row[cCol]
+ *
+ * @param[in] cCol the column of matrix c used to scale each row of b
+ * @param[in] b CpuSparseMatrix
+ * @param[in] c Matrix
+ */
+ void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
+
void randomizeUniform();
void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
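As a small worked illustration of the contract documented above (values are hypothetical): if row i of b stores the nonzeros {2.0, 3.0} and c(i, cCol) = 0.5, then rowScale writes {1.0, 1.5} into row i of this matrix when b carries FLOAT_VALUE data; when b is NO_VALUE, each stored position of row i simply receives c(i, cCol), i.e. {0.5, 0.5}. The sparsity patterns of this matrix and b must match, which is what the getRowStartIdx checks in the implementation enforce.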
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index a4c4374cf2f8b4b034d05e3a4c2221300a944214..7e5d4fd640f4399d1a217d1a0be76b3da457c0cc 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -205,8 +205,24 @@ set(DEPS_OPS
tensor_array_read_write_op
gru_op
adagrad_op
- sgd_op)
+ sgd_op
+ save_op
+ load_op
+ send_op
+ recv_op)
+add_subdirectory(detail)
+op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+ send_op.cc
+ PROPERTIES
+ COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
+op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+ recv_op.cc
+ PROPERTIES
+ COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy)
@@ -235,6 +251,10 @@ op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
+# FIXME(typhoonzero): save/load depends lodtensor serialization functions
+op_library(save_op DEPS lod_tensor)
+op_library(load_op DEPS lod_tensor)
+
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
@@ -242,6 +262,8 @@ endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
+
+
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
@@ -251,3 +273,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index f884e6efa917ce3f8554dce0e248f2b29273e3f3..ac97bd83ab7e7838871586cfe5acb832084b6cec 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
const auto x_dims = ctx->GetInputDim("X");
const TensorFormat tensor_format =
StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+
+ PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+ "Input X must have 2 to 5 dimensions.");
+
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
- PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
- "Input X must have 3 to 5 dimensions.");
-
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@@ -146,8 +147,8 @@ class BatchNormKernel : public framework::OpKernel {
const auto *x = ctx.Input("X");
const auto &x_dims = x->dims();
- PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
- "The Input dim size should be between 3 and 5");
+ PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+ "The Input dim size should be between 2 and 5");
const int N = x_dims[0];
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]
@@ -339,8 +340,8 @@ class BatchNormGradKernel
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x->dims();
- PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
- "The Input dim size should be between 3 and 5");
+ PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+ "The Input dim size should be between 2 and 5");
const int N = x_dims[0];
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]
diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc
index 726d1ea1b8d7ced93f94bb0e5bb4df9e43b0ac7b..7b2f3187007fa2491afa75de1cde1910c6ce9bb8 100644
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -29,14 +29,21 @@ void ExtractNCWHD(const framework::DDim &dims,
const TensorFormat &tensor_format, int *N, int *C, int *H,
int *W, int *D) {
*N = dims[0];
- *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
- *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
- *W = dims.size() > 3
- ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
- : 1;
- *D = dims.size() > 4
- ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
- : 1;
+ if (dims.size() == 2) {
+ *C = dims[1];
+ *H = 1;
+ *W = 1;
+ *D = 1;
+ } else {
+ *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
+ *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
+ *W = dims.size() > 3
+ ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
+ : 1;
+ *D = dims.size() > 4
+ ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
+ : 1;
+ }
}
template
@@ -56,8 +63,8 @@ class BatchNormKernel : public framework::OpKernel {
// NCHW [batch_size, in_channels, in_height, in_width]
const auto *x = ctx.Input("X");
const auto &x_dims = x->dims();
- PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
- "The Input dim size should be between 3 and 5");
+ PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+ "The Input dim size should be between 2 and 5");
int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
@@ -180,8 +187,8 @@ class BatchNormGradKernel
const auto &x_dims = x->dims();
- PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
- "The Input dim size should be between 3 and 5");
+ PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+ "The Input dim size should be between 2 and 5");
int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
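With the 2-D branch added to ExtractNCWHD above, a fully connected activation of shape [N, C] (for example dims = {32, 64}) is treated as N = 32, C = 64 and H = W = D = 1, so the same cuDNN batch-norm path can compute per-channel statistics for both convolutional and fully connected inputs. The numbers are illustrative only.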
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index a9763d424801cfced5fe4c4718a335a24b81cfdc..3f97dc7ee0a61944a8a57314b5ec7f33df619bf3 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel {
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor(paddings, strides, dilations);
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it manually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
@@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel {
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor(paddings, strides, dilations);
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it manually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index 7a36a9b21aa6a1b415ac5a232e65eda8051c87f8..462e6d9cbcbe61d9911efe8beff4446620e1e932 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
.SetDefault({0, 0});
AddAttr<int>(
"groups",
- "(int default:1), the group size of convolution operator. "
+ "(int default:1), the groups number of the convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters "
@@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
Convolution Operator.
The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups, dilations parameters. The size of each dimension of the
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
-Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+Input(Input) and Output(Output) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and W is
-the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements.
-These two elements represent height and width, respectively.
+the width of the feature.
+Filters(Input) is in MCHW format, where M is the number of output image channels, C is
+the number of input image channels, H is the height of the filter, and W
+is the width of the filter.
+Parameters(strides, paddings, dilations) contain two elements each. These two elements represent
+height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
- Input shape: (N, C_in, H_in, W_in)
- Filter shape: (C_out, C_in, H_f, W_f)
+ Input shape: $(N, C_{in}, H_{in}, W_{in})$
+ Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output:
- Output shape: (N, C_out, H_out, W_out)
- where
- H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1;
- W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1;
+ Output shape: $(N, C_{out}, H_{out}, W_{out})$
+ Where
+$$
+ H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
+ W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
+$$
)DOC");
}
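As a quick numeric check of the output-size formula above (illustrative values only): with H_in = 7, paddings[0] = 1, dilations[0] = 2, H_f = 3 and strides[0] = 2, the effective filter extent is dilations[0] * (H_f - 1) + 1 = 5, so H_out = (7 + 2 * 1 - 5) / 2 + 1 = 3.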
@@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
.SetDefault({0, 0, 0});
AddAttr<int>(
"groups",
- "(int default:1), the group size of convolution operator. "
+ "(int default:1), the groups number of the convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters "
@@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
AddAttr>("dilations",
"(vector default:{1, 1, 1}), the "
"dilations(d_dilation, h_dilation, w_dilation) of "
- "convolution operator. Currently, conv3d doesn't "
- "support dilation.")
+ "convolution operator.")
.SetDefault({1, 1, 1});
AddComment(R"DOC(
Convolution3D Operator.
The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
-Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+Input(Input) and output(Output) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature, H is the height of
-the feature, and W is the width of the feature. Parameters(ksize, strides, paddings)
-are three elements. These three elements represent depth, height and width, respectively.
+the feature, and W is the width of the feature.
+Filters(Input) is in MCDHW format, where M is the number of output image channels,
+C is the number of input image channels, D is the depth of the filter,
+H is the height of the filter, and W is the width of the filter.
+Parameters(strides, paddings, dilations) contain three elements each. These three elements
+represent depth, height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
- Input shape: (N, C_in, D_in, H_in, W_in)
- Filter shape: (C_out, C_in, D_f, H_f, W_f)
+ Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+ Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
Output:
- Output shape: (N, C_out, D_out, H_out, W_out)
- where
- D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
- H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
- W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
+ Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+ Where
+ $$
+ D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
+ H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
+ W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
+ $$
)DOC");
}
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
index 3e55ef036a7fb976117054574d1347fa943acd55..678b192dea78fc6b4a6b54c4bb09a55dfb8f9c38 100644
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
- "ConvTransposeOp paddings dimension and Conv strides "
+ "ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
"In ConvTransposeOp, The input channel should be the same "
@@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of input channels, H is the height of the feature, and "
"W is the width of the feature.");
- AddInput("Filter",
- "(Tensor) The filter tensor of convolution transpose operator. "
- "The format of the filter tensor is CMHW, where C is the number of "
- "output image channels, M is the number of input image channels, "
- "H is the height of the filter, and W is the width of the filter. "
- "We enforce groups number == 1 and padding == 0 in "
- "the convolution transpose scenario.");
+ AddInput(
+ "Filter",
+ "(Tensor) The filter tensor of convolution transpose operator. "
+ "The format of the filter tensor is MCHW, where M is the number of "
+ "input feature channels, C is the number of "
+ "output feature channels,"
+ "H is the height of the filter, and W is the width of the filter. "
+ "We enforce groups number == 1 in the convolution transpose scenario.");
AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>(
"strides",
- "(vector defalut:{1, 1}), the strides(h_stride, w_stride) of "
+ "(vector default:{1, 1}), the strides(h_stride, w_stride) of "
"convolution transpose operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>(
"paddings",
- "(vector defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution "
+ "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
"transpose operator.")
.SetDefault({0, 0});
AddComment(R"DOC(
@@ -88,21 +89,26 @@ Convolution2D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
-
-Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
-size, C is the number of channels, H is the height of the feature, and
-W is the width of the feature. Parameters(ksize, strides, paddings) are two elements.
-These two elements represent height and width, respectively.
+Input(Input) and output(Output) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature, and W is the width of the feature.
+Filter(Input) is in MCHW format, where M is the number of input feature channels,
+C is the number of output feature channels, H is the height of the filter,
+and W is the width of the filter.
+Parameters(strides, paddings) contain two elements each. These two elements represent height
+and width, respectively.
The input(X) size and output(Out) size may be different.
+
Example:
Input:
- Input shape: (N, C_in, H_in, W_in)
- Filter shape: (C_in, C_out, H_f, W_f)
+ Input shape: $(N, C_{in}, H_{in}, W_{in})$
+ Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
Output:
- Output shape: (N, C_out, H_out, W_out)
- where
- H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
- W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+ Output shape: $(N, C_{out}, H_{out}, W_{out})$
+ Where
+ $$
+ H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\
+ W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f
+ $$
)DOC");
}
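A quick numeric check of the transpose formula above (illustrative values only): with H_in = 3, strides[0] = 2, paddings[0] = 1 and H_f = 3, H_out = (3 - 1) * 2 - 2 * 1 + 3 = 5. This is consistent with a forward convolution that maps 5 -> 3 with the same stride, padding and filter size.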
@@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
"W is the width of the feature.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator."
- "The format of the filter tensor is CMDHW, where C is the number of "
- "output image channels, M is the number of input image channels, D "
+ "The format of the filter tensor is MCDHW, where M is the number of "
+ "input feature channels, C is the number of "
+ "output feature channels, D "
"is the depth of the filter, H is the height of the filter, and "
"W is the width of the filter."
"We enforce groups number == 1 and padding == 0 in "
@@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
"the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature.");
AddAttr>("strides",
- "(vector defalut:{1, 1, 1}), the "
+ "(vector default:{1, 1, 1}), the "
"strides{d_stride, h_stride, w_stride} of "
"convolution transpose operator.")
.SetDefault({1, 1, 1});
AddAttr>("paddings",
- "(vector defalut:{0, 0, 0}), paddings(d_pad, "
+ "(vector default:{0, 0, 0}), paddings(d_pad, "
"h_pad, w_pad) of convolution transpose operator.")
.SetDefault({0, 0, 0});
AddComment(R"DOC(
@@ -144,23 +151,28 @@ Convolution3D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
-
-Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
-size, C is the number of channels, D is the depth of the feature,
-H is the height of the feature, and W is the width of the feature.
-Parameters(ksize, strides, paddings) are three elements.
-These three elements represent depth, height and width, respectively.
+Input(Input) and output(Output) are in NCDHW format, where N is batch size, C is the
+number of channels, D is the depth of the feature, H is the height of the feature,
+and W is the width of the feature.
+Filter(Input) is in MCDHW format, where M is the number of input feature channels,
+C is the number of output feature channels, D is the depth of the filter, H is the
+height of the filter, and W is the width of the filter.
+Parameters(strides, paddings) contain three elements each. These three elements represent
+depth, height and width, respectively.
The input(X) size and output(Out) size may be different.
-Example:
+
+Example:
Input:
- Input shape: (N, C_in, D_in, H_in, W_in)
- Filter shape: (C_in, C_out, D_f, H_f, W_f)
+ Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+ Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
Output:
- Output shape: (N, C_out, D_out, H_out, W_out)
- where
- D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
- H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
- W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
+ Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+ Where
+ $$
+ D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\
+ H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\
+ W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f
+ $$
)DOC");
}
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
index 0fc0735788c499c2d520c0cc689e1ce07ba67ce8..1cacb770e6af3ad3c99ab81c5598ffcd228f59b2 100644
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel {
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
- // TODO(Zhuoyuan): Paddings can be added in future.
// groups will always be disabled in conv2dtranspose.
const int batch_size = static_cast<int>(input->dims()[0]);
diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f6bdc63cc2cfae526fe911ee4d989675452d5c5d
--- /dev/null
+++ b/paddle/operators/detail/CMakeLists.txt
@@ -0,0 +1 @@
+grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89dc5045221156eed7aa9411bc96ad86f91136d2
--- /dev/null
+++ b/paddle/operators/detail/recv_impl.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+Status SendRecvServerImpl::SendVariable(ServerContext *context,
+ const VariableMessage *in_var,
+ VariableMessage *out_var) {
+ framework::LoDTensor t;
+ // TODO(typhoonzero): deserialize in_tensor and run pserver network.
+ std::istringstream iss(in_var->serialized());
+ framework::DeserializeFromStream(iss, &t);
+ lodtensor_queue_.Push(std::move(t));
+ // Block until the sub graph is done.
+ t = lodtensor_return_queue_.Pop();
+ std::ostringstream oss;
+ // FIXME(typhoonzero): get context from op.
+ framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
+ std::string *varname = out_var->mutable_varname();
+ *varname = in_var->varname();
+ std::string *serialized = out_var->mutable_serialized();
+ *serialized = oss.str();
+
+ return Status::OK;
+}
+
+} // namespace detail
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da1ddf75d2afb85670c5ea0c9884376415f28208
--- /dev/null
+++ b/paddle/operators/detail/send_impl.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+bool RPCClient::SendVariable(const framework::Scope& scope,
+ const std::string& inname,
+ const std::string& outname) {
+ ClientContext context;
+ VariableMessage msg, out_msg;
+ // FIXME(typhoonzero): pass device context to here.
+ auto ctx = platform::CPUDeviceContext();
+ auto* var = scope.FindVar(inname);
+ PADDLE_ENFORCE(var);
+ // TODO(typhoonzero): support SelectedRows
+ PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+ "Only support LoDTensor, %s has wrong type", inname);
+ const framework::LoDTensor& tensor = var->Get<framework::LoDTensor>();
+ std::ostringstream oss;
+ framework::SerializeToStream(oss, tensor, ctx);
+ msg.set_varname(inname);
+ msg.set_serialized(oss.str());
+ Status status = stub_->SendVariable(&context, msg, &out_msg);
+ if (!status.ok()) {
+ return false;
+ }
+ std::istringstream iss(out_msg.serialized());
+ framework::LoDTensor ret_tensor;
+ framework::DeserializeFromStream(iss, &ret_tensor);
+ auto* outvar = scope.FindVar(outname);
+ framework::LoDTensor* out_tensor = outvar->GetMutable<framework::LoDTensor>();
+ // FIXME(typhoonzero): do not copy.
+ framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor);
+ return true;
+}
+
+} // namespace detail
+} // namespace operators
+} // namespace paddle
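A minimal usage sketch of the client path above. The address, the variable names and the surrounding scope are assumptions for illustration, not part of the patch; it presumes a pserver running SendRecvServerImpl is already listening on the given port.

    #include <grpc++/grpc++.h>
    #include "paddle/operators/detail/send_recv_impl.h"

    // Hypothetical helper: "scope" already holds a LoDTensor variable "x".
    bool SendOnce(const paddle::framework::Scope& scope) {
      auto channel = grpc::CreateChannel("127.0.0.1:6174",
                                         grpc::InsecureChannelCredentials());
      paddle::operators::detail::RPCClient client(channel);
      // Serializes "x", sends it, and blocks until the server returns "x_out".
      return client.SendVariable(scope, "x", "x_out");
    }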
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
new file mode 100644
index 0000000000000000000000000000000000000000..962c7d59819dede022474aec4a2d7f538d28c688
--- /dev/null
+++ b/paddle/operators/detail/send_recv.proto
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+syntax = "proto3";
+
+package sendrecv;
+
+service SendRecvService {
+ // For parameter-server round-robin-like hashing, do not split tensors.
+ // Send and recv only one tensor
+ rpc SendVariable(VariableMessage) returns (VariableMessage) {}
+}
+
+// VariableMessage is serialized paddle variable message.
+// It can be:
+// Tensor
+// LoDTensor
+// SelectedRows
+message VariableMessage {
+ string varname = 1;
+ bytes serialized = 2;
+}
+
+message VoidMessage {}
\ No newline at end of file
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9a5340a8636db7b5d6ec7b21368632d3916b4aa
--- /dev/null
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+// #include
+// #include
+// #include
+// #include
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+#include <grpc++/grpc++.h>
+
+using grpc::Channel;
+using grpc::Server;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerBuilder;
+
+using grpc::ClientContext;
+using grpc::ClientReader;
+using grpc::ClientReaderWriter;
+using grpc::ClientWriter;
+using grpc::Status;
+using sendrecv::SendRecvService;
+using sendrecv::VariableMessage;
+using sendrecv::VoidMessage;
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class SendRecvServerImpl final : public SendRecvService::Service {
+ public:
+ explicit SendRecvServerImpl() {}
+
+ Status SendVariable(ServerContext *context, const VariableMessage *in_var,
+ VariableMessage *out_var) override;
+
+ const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
+
+ void Push(const framework::LoDTensor &tensor) {
+ this->lodtensor_return_queue_.Push(tensor);
+ }
+
+ private:
+ SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
+ SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
+ SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
+ SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
+};
+
+// RPCClient is a class to send tensors to pserver sub-network
+// using different hashing methods.
+class RPCClient {
+ public:
+ RPCClient(std::shared_ptr<Channel> channel)
+ : stub_(SendRecvService::NewStub(channel)) {}
+
+ bool SendVariable(const framework::Scope &scope, const std::string &inname,
+ const std::string &outname);
+
+ private:
+ std::unique_ptr<SendRecvService::Stub> stub_;
+};
+
+} // namespace detail
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..44899217579532af2c1d2e6074ec0e08231e7b86
--- /dev/null
+++ b/paddle/operators/detail/simple_block_queue.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+template <typename T>
+class SimpleBlockQueue {
+ private:
+ std::mutex mutex_;
+ std::condition_variable condition_;
+ std::deque queue_;
+
+ public:
+ void Push(T const& value) {
+ {
+ std::unique_lock lock(this->mutex_);
+ queue_.push_front(value);
+ }
+ this->condition_.notify_one();
+ }
+
+ T Pop() {
+ std::unique_lock lock(this->mutex_);
+ this->condition_.wait(lock, [=] { return !this->queue_.empty(); });
+ T rc(std::move(this->queue_.back()));
+ this->queue_.pop_back();
+ return rc;
+ }
+};
+
+} // namespace detail
+} // namespace operators
+} // namespace paddle
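A minimal sketch of how the queue is meant to be used (hypothetical values, not part of the patch): Push() never blocks, Pop() blocks on the condition variable until an element arrives, and elements come out in FIFO order because Push() inserts at the front while Pop() takes from the back.

    #include <thread>
    #include "paddle/operators/detail/simple_block_queue.h"

    void Example() {
      paddle::operators::detail::SimpleBlockQueue<int> q;
      std::thread producer([&q] { q.Push(1); q.Push(2); });
      int first = q.Pop();   // blocks until the producer pushes; returns 1
      int second = q.Pop();  // returns 2 (FIFO)
      producer.join();
    }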
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index 1b18368e0e16365682520b62a7f6adab0cbb527f..564489d3a98b59e3e527be5613a73d23d6dbbf31 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel {
int frame_size = hidden_dims[1];
math::hl_gru_value<T> gru_value;
- gru_value.gateWeight = const_cast(weight_data);
- gru_value.stateWeight =
+ gru_value.gate_weight = const_cast<T*>(weight_data);
+ gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0;
const size_t* order = batch_gate->lod()[2].data();
@@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel {
// to reorder.
ReorderInitState(context.device_context(), *h0, order,
&ordered_h0, true);
- gru_value.prevOutValue = ordered_h0.data();
+ gru_value.prev_out_value = ordered_h0.data();
} else {
- gru_value.prevOutValue = nullptr;
+ gru_value.prev_out_value = nullptr;
}
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
@@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel {
Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
- gru_value.outputValue = hidden_t.data();
- gru_value.gateValue = gate_t.data();
- gru_value.resetOutputValue = reset_hidden_prev_t.data();
+ gru_value.output_value = hidden_t.data();
+ gru_value.gate_value = gate_t.data();
+ gru_value.reset_output_value = reset_hidden_prev_t.data();
math::GRUUnitFunctor<Place, T>::compute(
dev_ctx, gru_value, frame_size, cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation")));
- gru_value.prevOutValue = gru_value.outputValue;
+ gru_value.prev_out_value = gru_value.output_value;
}
math::Batch2LoDTensorFunctor<Place, T> to_seq;
@@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel {
to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
math::hl_gru_value<T> gru_value;
- gru_value.gateWeight = const_cast(weight_data);
- gru_value.stateWeight =
+ gru_value.gate_weight = const_cast<T*>(weight_data);
+ gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
math::hl_gru_grad<T> gru_grad;
if (weight_grad) {
- gru_grad.gateWeightGrad =
+ gru_grad.gate_weight_grad =
weight_grad->mutable_data(context.GetPlace());
zero(dev_ctx, weight_grad, static_cast(0.0));
- gru_grad.stateWeightGrad =
+ gru_grad.state_weight_grad =
weight_grad->data() + 2 * frame_size * frame_size;
} else {
- gru_grad.gateWeightGrad = nullptr;
- gru_grad.stateWeightGrad = nullptr;
+ gru_grad.gate_weight_grad = nullptr;
+ gru_grad.state_weight_grad = nullptr;
}
auto batch_starts = batch_hidden_grad.lod()[0];
@@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel {
int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend);
- gru_value.gateValue = gate_t.data();
+ gru_value.gate_value = gate_t.data();
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
- gru_value.resetOutputValue = reset_hidden_prev_t.data();
+ gru_value.reset_output_value = reset_hidden_prev_t.data();
Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
- gru_grad.outputGrad = hidden_grad_t.data();
+ gru_grad.output_grad = hidden_grad_t.data();
Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
- gru_grad.gateGrad = gate_grad_t.data();
+ gru_grad.gate_grad = gate_grad_t.data();
Tensor reset_hidden_prev_grad_t =
batch_reset_hidden_prev_grad.Slice(bstart, bend);
- gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data();
+ gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data();
if (n == 0) {
- gru_value.prevOutValue = h0 ? ordered_h0.data() : nullptr;
- gru_grad.prevOutGrad =
+ gru_value.prev_out_value = h0 ? ordered_h0.data() : nullptr;
+ gru_grad.prev_out_grad =
h0 && h0_grad ? ordered_h0_grad.data() : nullptr;
} else {
int bstart_pre = static_cast(batch_starts[n - 1]);
Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
- gru_value.prevOutValue = hidden_prev_t.data();
+ gru_value.prev_out_value = hidden_prev_t.data();
Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
- gru_grad.prevOutGrad = hidden_prev_grad_t.data();
+ gru_grad.prev_out_grad = hidden_prev_grad_t.data();
}
math::GRUUnitGradFunctor<Place, T>::compute(
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index b0838eed1611c1d51e57fc2300606f753982dc89..4e58b84430f2a8697bbbc1acf971fd063120f563 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase {
out_var_name);
auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
- uint32_t version;
- fin.read(reinterpret_cast(&version), sizeof(version));
- PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
- framework::TensorDesc desc;
- { // int32_t size
- // proto buffer
- int32_t size;
- fin.read(reinterpret_cast(&size), sizeof(size));
- std::unique_ptr buf(new char[size]);
- fin.read(reinterpret_cast(buf.get()), size);
- PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
- "Cannot parse tensor desc");
- }
- { // read tensor
- std::vector dims;
- dims.reserve(static_cast(desc.dims().size()));
- std::copy(desc.dims().begin(), desc.dims().end(),
- std::back_inserter(dims));
- tensor->Resize(framework::make_ddim(dims));
-
- void *buf;
- platform::Place cpu = platform::CPUPlace();
- switch (desc.data_type()) {
- case framework::FP32:
- buf = tensor->mutable_data(cpu);
- break;
- case framework::FP64:
- buf = tensor->mutable_data(cpu);
- break;
- case framework::INT32:
- buf = tensor->mutable_data(cpu);
- break;
- case framework::INT64:
- buf = tensor->mutable_data(cpu);
- break;
- default:
- PADDLE_THROW("DataType %d not supported", desc.data_type());
- }
- fin.read(static_cast(buf), tensor->memory_size());
- }
- { // read lod
- uint64_t lod_level;
- fin.read(reinterpret_cast(&lod_level), sizeof(lod_level));
- auto &lod = *tensor->mutable_lod();
- lod.resize(lod_level);
- for (uint64_t i = 0; i < lod_level; ++i) {
- uint64_t size;
- fin.read(reinterpret_cast(&size), sizeof(size));
- std::vector tmp(size / sizeof(size_t));
- fin.read(reinterpret_cast(tmp.data()),
- static_cast(size));
- lod[i] = tmp;
- }
- }
+ framework::DeserializeFromStream(fin, tensor);
auto place = dev_ctx.GetPlace();
if (platform::is_gpu_place(place)) {
diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..257e5c8a49e935dcbdc33e5060118ef1804fa8d7
--- /dev/null
+++ b/paddle/operators/log_loss_op.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/log_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LogLossOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+ "Input(Predicted) must be initialized.");
+ PADDLE_ENFORCE(ctx->HasInput("Labels"),
+ "Input(Labels) must be initialized.");
+
+ auto pred_dims = ctx->GetInputDim("Predicted");
+ auto label_dims = ctx->GetInputDim("Labels");
+
+ PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+ PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+ "The rank of Input(Predicted) must be 2 and the shape is "
+ "[batch_size, 1].");
+ PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+ "Each row of Input(Predicted) contains a real value, "
+ "so the 2nd dimension of Input(X) must be 1.");
+
+ ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+ ctx->ShareLoD("Predicted", "Loss");
+ }
+};
+
+template
+class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ LogLossOpMaker(framework::OpProto* proto,
+ framework::OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("Predicted",
+ "The input value (Predicted) of Log loss op."
+ "Predicted is a 2-D tensor with shape [batch_size, 1].");
+ AddInput("Labels",
+ "The target value (Labels) of Log loss op."
+ "Labels is a 2-D tensor with shape [batch_size, 1].");
+ AddOutput("Loss",
+ "The output tensor with shape [batch_size, 1] "
+ "which represents the log loss.");
+ AddAttr("epsilon", "Epsilon in log loss.");
+ AddComment(R"DOC(
+LogLoss Operator.
+
+Log loss is a loss function used for binary classification. It quantifies the
+quality of a classifier by penalising confident but wrong predictions;
+minimising the log loss drives the predicted probabilities towards the target
+labels. We define Predicted as the values predicted by our model and Labels as
+the target ground-truth values. Log loss measures how close the predicted
+values are to the target. The shapes of Predicted and Labels are both
+[batch_size, 1].
+The equation is:
+
+$$
+Loss = - Labels * log(Predicted + \epsilon) -
+ (1 - Labels) * log(1 - Predicted + \epsilon)
+$$
+
+)DOC");
+ }
+};
+
+class LogLossGradOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+ "Input(Predicted) should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Labels"),
+ "Input(Labels) should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+ "Input(Loss@GRAD) should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
+ "Output(Predicted@GRAD) should not be null.");
+
+ auto pred_dims = ctx->GetInputDim("Predicted");
+ auto label_dims = ctx->GetInputDim("Labels");
+ auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+ PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+
+ auto pred_grad_name = framework::GradVarName("Predicted");
+ ctx->SetOutputDim(pred_grad_name, pred_dims);
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad,
+ ops::LogLossGradOp);
+REGISTER_OP_CPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6c189ef3412d7a56205502c7913e93218a03b929
--- /dev/null
+++ b/paddle/operators/log_loss_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/log_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..73404fce9157fa750a51451fa93646bc4059481a
--- /dev/null
+++ b/paddle/operators/log_loss_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template
+using EigenVector = framework::EigenVector;
+
+template
+class LogLossKernel : public framework::OpKernel {
+ public:
+ void Compute(const framework::ExecutionContext& ctx) const override {
+ auto* loss_out = ctx.Output("Loss");
+
+ loss_out->mutable_data(ctx.GetPlace());
+
+ auto epsilon = static_cast(ctx.Attr("epsilon"));
+
+ auto prediction = EigenVector::Flatten(*ctx.Input("Predicted"));
+ auto label = EigenVector::Flatten(*ctx.Input("Labels"));
+
+ auto loss = EigenVector::Flatten(*loss_out);
+ auto place = ctx.GetEigenDevice();
+
+ loss.device(place) = (-(label * (prediction + epsilon).log()) -
+ ((static_cast(1) - label) *
+ (static_cast(1) - prediction + epsilon).log()));
+ }
+};
+
+template
+class LogLossGradKernel : public framework::OpKernel {
+ public:
+ void Compute(const framework::ExecutionContext& ctx) const override {
+ auto epsilon = static_cast(ctx.Attr("epsilon"));
+
+ auto prediction = EigenVector::Flatten(*ctx.Input("Predicted"));
+ auto label = EigenVector::Flatten(*ctx.Input("Labels"));
+
+ auto* dloss = ctx.Input(framework::GradVarName("Loss"));
+ auto* dpred = ctx.Output(framework::GradVarName("Predicted"));
+
+ auto dl = EigenVector::Flatten(*dloss);
+ auto place = ctx.GetEigenDevice();
+
+ if (dpred) {
+ dpred->mutable_data(ctx.GetPlace());
+ auto dx = framework::EigenVector::Flatten(*dpred);
+ dx.device(place) = dl * (-(label / (prediction + epsilon)) +
+ ((static_cast(1) - label) /
+ (static_cast(1) - prediction + epsilon)));
+ }
+ }
+};
+
+} // namespace operators
+} // namespace paddle
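For reference, the expression computed in LogLossGradKernel above is the derivative of the loss documented in log_loss_op.cc, scaled by the incoming gradient of Loss (chain rule):

$$
\frac{\partial Loss}{\partial Predicted} =
    -\frac{Labels}{Predicted + \epsilon}
    + \frac{1 - Labels}{1 - Predicted + \epsilon}
$$

which is exactly the factor that the kernel multiplies by dl before writing it into the Predicted gradient.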
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h
index 51af140cf4d5e6581765bea00033fa53d383230d..4c67dec9cbeb48f400f79f5ed7ba3c939fa2540c 100644
--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -25,393 +25,397 @@ namespace detail {
#ifndef __NVCC__
template
-void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
- T *gateValue, T *resetOutputValue,
- T *prevOutputValue, int frameSize,
+void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
+ T *gate_value, T *reset_output_value,
+ T *prev_output_value, int frame_size,
activation_mode_t active_gate) {
- T rValueUpdateGate;
- T rValueResetGate;
- T rValueResetOutput;
- T rPrevOut = 0;
- T *updateGate = gateValue;
- T *resetGate = gateValue + frameSize;
-
- for (int i = 0; i < frameSize; i++) {
- rValueUpdateGate = updateGate[i];
- rValueResetGate = resetGate[i];
- if (prevOutputValue) {
- rPrevOut = prevOutputValue[i];
+ T r_value_update_gate;
+ T r_value_reset_gate;
+ T r_value_reset_output;
+ T r_prev_out = 0;
+ T *update_gate = gate_value;
+ T *reset_gate = gate_value + frame_size;
+
+ for (int i = 0; i < frame_size; i++) {
+ r_value_update_gate = update_gate[i];
+ r_value_reset_gate = reset_gate[i];
+ if (prev_output_value) {
+ r_prev_out = prev_output_value[i];
}
- opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
- rValueResetOutput, active_gate);
+ op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+ r_value_reset_output, active_gate);
- updateGate[i] = rValueUpdateGate;
- resetGate[i] = rValueResetGate;
- resetOutputValue[i] = rValueResetOutput;
+ update_gate[i] = r_value_update_gate;
+ reset_gate[i] = r_value_reset_gate;
+ reset_output_value[i] = r_value_reset_output;
}
}
template
-void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
- T *gateValue, T *prevOutputValue,
- T *outputValue, int frameSize,
+void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
+ T *gate_value, T *prev_output_value,
+ T *output_value, int frame_size,
activation_mode_t active_node) {
- T rValueUpdateGate;
- T rValueFrameState;
- T rPrevOut = 0;
- T rOutput;
- T *updateGate = gateValue;
- T *frameState = gateValue + frameSize * 2;
-
- for (int i = 0; i < frameSize; i++) {
- rValueUpdateGate = updateGate[i];
- rValueFrameState = frameState[i];
- if (prevOutputValue) {
- rPrevOut = prevOutputValue[i];
+ T r_value_update_gate;
+ T r_value_frame_state;
+ T r_prev_out = 0;
+ T r_output;
+ T *update_gate = gate_value;
+ T *frame_state = gate_value + frame_size * 2;
+
+ for (int i = 0; i < frame_size; i++) {
+ r_value_update_gate = update_gate[i];
+ r_value_frame_state = frame_state[i];
+ if (prev_output_value) {
+ r_prev_out = prev_output_value[i];
}
- opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
- active_node);
+ op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+ r_output, active_node);
- frameState[i] = rValueFrameState;
- outputValue[i] = rOutput;
+ frame_state[i] = r_value_frame_state;
+ output_value[i] = r_output;
}
}
template
-void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
- T *resetOutputValue, T *prevOutputValue,
- int frameSize,
+void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
+ T *gate_value, T *reset_output_value,
+ T *prev_output_value, int frame_size,
activation_mode_t active_gate) {
#ifdef __AVX__
- __m256 rValueUpdateGate;
- __m256 rValueResetGate;
- __m256 rValueResetOutput;
- __m256 rPrevOut = _mm256_set1_ps(0.0f);
- __m256 *updateGate = (__m256 *)gateValue;
- __m256 *resetGate = (__m256 *)(gateValue + frameSize);
-
- for (int i = 0; i < frameSize / 8; i++) {
- rValueUpdateGate = updateGate[i];
- rValueResetGate = resetGate[i];
- if (prevOutputValue) {
- rPrevOut = ((__m256 *)prevOutputValue)[i];
+ __m256 r_value_update_gate;
+ __m256 r_value_reset_gate;
+ __m256 r_value_reset_output;
+ __m256 r_prev_out = _mm256_set1_ps(0.0f);
+ __m256 *update_gate = (__m256 *)gate_value;
+ __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+
+ for (int i = 0; i < frame_size / 8; i++) {
+ r_value_update_gate = update_gate[i];
+ r_value_reset_gate = reset_gate[i];
+ if (prev_output_value) {
+ r_prev_out = ((__m256 *)prev_output_value)[i];
}
- opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
- rValueResetOutput, active_gate);
+ op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+ r_value_reset_output, active_gate);
- updateGate[i] = rValueUpdateGate;
- resetGate[i] = rValueResetGate;
- ((__m256 *)resetOutputValue)[i] = rValueResetOutput;
+ update_gate[i] = r_value_update_gate;
+ reset_gate[i] = r_value_reset_gate;
+ ((__m256 *)reset_output_value)[i] = r_value_reset_output;
}
#endif
}
template
-void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
- T *prevOutputValue, T *outputValue,
- int frameSize,
+void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
+ T *gate_value, T *prev_output_value,
+ T *output_value, int frame_size,
activation_mode_t active_node) {
#ifdef __AVX__
- __m256 rValueUpdateGate;
- __m256 rValueFrameState;
- __m256 rPrevOut = _mm256_set1_ps(0.0f);
- __m256 rOutput;
- __m256 *updateGate = (__m256 *)gateValue;
- __m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
-
- for (int i = 0; i < frameSize / 8; i++) {
- rValueUpdateGate = updateGate[i];
- rValueFrameState = frameState[i];
- if (prevOutputValue) {
- rPrevOut = ((__m256 *)prevOutputValue)[i];
+ __m256 r_value_update_gate;
+ __m256 r_value_frame_state;
+ __m256 r_prev_out = _mm256_set1_ps(0.0f);
+ __m256 r_output;
+ __m256 *update_gate = (__m256 *)gate_value;
+ __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+
+ for (int i = 0; i < frame_size / 8; i++) {
+ r_value_update_gate = update_gate[i];
+ r_value_frame_state = frame_state[i];
+ if (prev_output_value) {
+ r_prev_out = ((__m256 *)prev_output_value)[i];
}
- opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
- active_node);
+ op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+ r_output, active_node);
- frameState[i] = rValueFrameState;
- ((__m256 *)outputValue)[i] = rOutput;
+ frame_state[i] = r_value_frame_state;
+ ((__m256 *)output_value)[i] = r_output;
}
#endif
}
template
-inline void forward_reset_output(OpResetOutput opResetOutput,
- hl_gru_value value, int frameSize,
- int batchSize, activation_mode_t active_gate) {
- for (int b = 0; b < batchSize; b++) {
- if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void forward_reset_output(OpResetOutput op_reset_output,
+ hl_gru_value value, int frame_size,
+ int batch_size,
+ activation_mode_t active_gate) {
+ for (int b = 0; b < batch_size; b++) {
+ if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output(
- opResetOutput, value.gateValue, value.resetOutputValue,
- value.prevOutValue, frameSize, active_gate);
+ op_reset_output, value.gate_value, value.reset_output_value,
+ value.prev_out_value, frame_size, active_gate);
} else {
hl_naive_gru_forward_reset_output(
- opResetOutput, value.gateValue, value.resetOutputValue,
- value.prevOutValue, frameSize, active_gate);
+ op_reset_output, value.gate_value, value.reset_output_value,
+ value.prev_out_value, frame_size, active_gate);
}
- value.gateValue += frameSize * 3;
- value.resetOutputValue += frameSize;
- if (value.prevOutValue) {
- value.prevOutValue += frameSize;
+ value.gate_value += frame_size * 3;
+ value.reset_output_value += frame_size;
+ if (value.prev_out_value) {
+ value.prev_out_value += frame_size;
}
}
}
template <class OpFinalOutput, typename T>
-inline void forward_final_output(OpFinalOutput opFinalOutput,
- hl_gru_value value, int frameSize,
- int batchSize, activation_mode_t active_node) {
- for (int b = 0; b < batchSize; b++) {
- if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
- hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
- value.prevOutValue, value.outputValue,
- frameSize, active_node);
+inline void forward_final_output(OpFinalOutput op_final_output,
+ hl_gru_value value, int frame_size,
+ int batch_size,
+ activation_mode_t active_node) {
+ for (int b = 0; b < batch_size; b++) {
+ if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+ hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
+ value.prev_out_value, value.output_value,
+ frame_size, active_node);
} else {
- hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
- value.prevOutValue, value.outputValue,
- frameSize, active_node);
+ hl_naive_gru_forward_final_output(
+ op_final_output, value.gate_value, value.prev_out_value,
+ value.output_value, frame_size, active_node);
}
- value.gateValue += frameSize * 3;
- value.outputValue += frameSize;
- if (value.prevOutValue) {
- value.prevOutValue += frameSize;
+ value.gate_value += frame_size * 3;
+ value.output_value += frame_size;
+ if (value.prev_out_value) {
+ value.prev_out_value += frame_size;
}
}
}
template <class OpStateGrad, typename T>
-void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
- T *gateGrad, T *prevOutValue,
- T *prevOutGrad, T *outputGrad,
- int frameSize,
+void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+ T *gate_grad, T *prev_out_value,
+ T *prev_out_grad, T *output_grad,
+ int frame_size,
activation_mode_t active_node) {
- T rUpdateGateValue;
- T rUpdateGateGrad;
- T rFrameStateValue;
- T rFrameStateGrad;
- T rOutGrad;
- T rPrevOutValue = 0;
- T rPrevOutGrad = 0;
- T *updateGateValue = gateValue;
- T *updateGateGrad = gateGrad;
- T *frameStateValue = gateValue + frameSize * 2;
- T *frameStateGrad = gateGrad + frameSize * 2;
-
- for (int i = 0; i < frameSize; i++) {
- rUpdateGateValue = updateGateValue[i];
- rFrameStateValue = frameStateValue[i];
- rOutGrad = outputGrad[i];
- if (prevOutValue) {
- rPrevOutValue = prevOutValue[i];
+ T r_update_gate_value;
+ T r_update_gate_grad;
+ T r_frame_state_value;
+ T r_frame_state_grad;
+ T r_out_grad;
+ T r_prev_out_value = 0;
+ T r_prev_out_grad = 0;
+ T *update_gate_value = gate_value;
+ T *update_gate_grad = gate_grad;
+ T *frame_state_value = gate_value + frame_size * 2;
+ T *frame_state_grad = gate_grad + frame_size * 2;
+
+ for (int i = 0; i < frame_size; i++) {
+ r_update_gate_value = update_gate_value[i];
+ r_frame_state_value = frame_state_value[i];
+ r_out_grad = output_grad[i];
+ if (prev_out_value) {
+ r_prev_out_value = prev_out_value[i];
}
- if (prevOutGrad) {
- rPrevOutGrad = prevOutGrad[i];
+ if (prev_out_grad) {
+ r_prev_out_grad = prev_out_grad[i];
}
- opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
- rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
- active_node);
+ op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+ r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+ r_out_grad, active_node);
- updateGateGrad[i] = rUpdateGateGrad;
- frameStateGrad[i] = rFrameStateGrad;
- if (prevOutGrad) {
- prevOutGrad[i] = rPrevOutGrad;
+ update_gate_grad[i] = r_update_gate_grad;
+ frame_state_grad[i] = r_frame_state_grad;
+ if (prev_out_grad) {
+ prev_out_grad[i] = r_prev_out_grad;
}
}
}
template <class OpResetGrad, typename T>
-void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
- T *gateGrad, T *prevOutValue,
- T *prevOutGrad, T *resetOutputGrad,
- int frameSize,
+void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+ T *gate_grad, T *prev_out_value,
+ T *prev_out_grad, T *reset_output_grad,
+ int frame_size,
activation_mode_t active_gate) {
- T rUpdateGateValue;
- T rUpdateGateGrad;
- T rResetGateValue;
- T rResetGateGrad;
- T rResetOutputGrad = 0;
- T rPrevOutValue = 0;
- T rPrevOutGrad = 0;
- T *updateGateValue = gateValue;
- T *updateGateGrad = gateGrad;
- T *resetGateValue = gateValue + frameSize;
- T *resetGateGrad = gateGrad + frameSize;
-
- for (int i = 0; i < frameSize; i++) {
- rUpdateGateValue = updateGateValue[i];
- rUpdateGateGrad = updateGateGrad[i];
- rResetGateValue = resetGateValue[i];
-
- if (prevOutValue && prevOutGrad) {
- rResetOutputGrad = resetOutputGrad[i];
+ T r_update_gate_value;
+ T r_update_gate_grad;
+ T r_reset_gate_value;
+ T r_reset_gate_grad;
+ T r_reset_output_grad = 0;
+ T r_prev_out_value = 0;
+ T r_prev_out_grad = 0;
+ T *update_gate_value = gate_value;
+ T *update_gate_grad = gate_grad;
+ T *reset_gate_value = gate_value + frame_size;
+ T *reset_gate_grad = gate_grad + frame_size;
+
+ for (int i = 0; i < frame_size; i++) {
+ r_update_gate_value = update_gate_value[i];
+ r_update_gate_grad = update_gate_grad[i];
+ r_reset_gate_value = reset_gate_value[i];
+
+ if (prev_out_value && prev_out_grad) {
+ r_reset_output_grad = reset_output_grad[i];
}
- if (prevOutValue) {
- rPrevOutValue = prevOutValue[i];
+ if (prev_out_value) {
+ r_prev_out_value = prev_out_value[i];
}
- if (prevOutGrad) {
- rPrevOutGrad = prevOutGrad[i];
+ if (prev_out_grad) {
+ r_prev_out_grad = prev_out_grad[i];
}
- opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
- rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
- active_gate);
+ op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+ r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+ r_reset_output_grad, active_gate);
- updateGateGrad[i] = rUpdateGateGrad;
- resetGateGrad[i] = rResetGateGrad;
- if (prevOutGrad) {
- prevOutGrad[i] = rPrevOutGrad;
+ update_gate_grad[i] = r_update_gate_grad;
+ reset_gate_grad[i] = r_reset_gate_grad;
+ if (prev_out_grad) {
+ prev_out_grad[i] = r_prev_out_grad;
}
}
}
template <class OpStateGrad, typename T>
-void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
- T *gateGrad, T *prevOutValue,
- T *prevOutGrad, T *outputGrad,
- int frameSize,
+void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+ T *gate_grad, T *prev_out_value,
+ T *prev_out_grad, T *output_grad,
+ int frame_size,
activation_mode_t active_node) {
#ifdef __AVX__
- __m256 rUpdateGateValue;
- __m256 rUpdateGateGrad;
- __m256 rFrameStateValue;
- __m256 rFrameStateGrad;
- __m256 rOutGrad;
- __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
- __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
- __m256 *updateGateValue = (__m256 *)gateValue;
- __m256 *updateGateGrad = (__m256 *)gateGrad;
- __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
- __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
-
- for (int i = 0; i < frameSize / 8; i++) {
- rUpdateGateValue = updateGateValue[i];
- rFrameStateValue = frameStateValue[i];
- rOutGrad = ((__m256 *)outputGrad)[i];
- if (prevOutValue) {
- rPrevOutValue = ((__m256 *)prevOutValue)[i];
+ __m256 r_update_gate_value;
+ __m256 r_update_gate_grad;
+ __m256 r_frame_state_value;
+ __m256 r_frame_state_grad;
+ __m256 r_out_grad;
+ __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+ __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+ __m256 *update_gate_value = (__m256 *)gate_value;
+ __m256 *update_gate_grad = (__m256 *)gate_grad;
+ __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
+ __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+
+ for (int i = 0; i < frame_size / 8; i++) {
+ r_update_gate_value = update_gate_value[i];
+ r_frame_state_value = frame_state_value[i];
+ r_out_grad = ((__m256 *)output_grad)[i];
+ if (prev_out_value) {
+ r_prev_out_value = ((__m256 *)prev_out_value)[i];
}
- if (prevOutGrad) {
- rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+ if (prev_out_grad) {
+ r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
}
- opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
- rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
- active_node);
+ op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+ r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+ r_out_grad, active_node);
- updateGateGrad[i] = rUpdateGateGrad;
- frameStateGrad[i] = rFrameStateGrad;
- if (prevOutGrad) {
- ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+ update_gate_grad[i] = r_update_gate_grad;
+ frame_state_grad[i] = r_frame_state_grad;
+ if (prev_out_grad) {
+ ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
}
}
#endif
}
template <class OpResetGrad, typename T>
-void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
- T *gateGrad, T *prevOutValue,
- T *prevOutGrad, T *resetOutputGrad,
- int frameSize,
+void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+ T *gate_grad, T *prev_out_value,
+ T *prev_out_grad, T *reset_output_grad,
+ int frame_size,
activation_mode_t active_gate) {
#ifdef __AVX__
- __m256 rUpdateGateValue;
- __m256 rUpdateGateGrad;
- __m256 rResetGateValue;
- __m256 rResetGateGrad;
- __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
- __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
- __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
- __m256 *updateGateValue = (__m256 *)gateValue;
- __m256 *updateGateGrad = (__m256 *)gateGrad;
- __m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
- __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
-
- for (int i = 0; i < frameSize / 8; i++) {
- rUpdateGateValue = updateGateValue[i];
- rUpdateGateGrad = updateGateGrad[i];
- rResetGateValue = resetGateValue[i];
-
- if (prevOutValue && prevOutGrad) {
- rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
+ __m256 r_update_gate_value;
+ __m256 r_update_gate_grad;
+ __m256 r_reset_gate_value;
+ __m256 r_reset_gate_grad;
+ __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
+ __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+ __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+ __m256 *update_gate_value = (__m256 *)gate_value;
+ __m256 *update_gate_grad = (__m256 *)gate_grad;
+ __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
+ __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+
+ for (int i = 0; i < frame_size / 8; i++) {
+ r_update_gate_value = update_gate_value[i];
+ r_update_gate_grad = update_gate_grad[i];
+ r_reset_gate_value = reset_gate_value[i];
+
+ if (prev_out_value && prev_out_grad) {
+ r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
}
- if (prevOutValue) {
- rPrevOutValue = ((__m256 *)prevOutValue)[i];
+ if (prev_out_value) {
+ r_prev_out_value = ((__m256 *)prev_out_value)[i];
}
- if (prevOutGrad) {
- rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+ if (prev_out_grad) {
+ r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
}
- opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
- rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
- active_gate);
+ op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+ r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+ r_reset_output_grad, active_gate);
- updateGateGrad[i] = rUpdateGateGrad;
- resetGateGrad[i] = rResetGateGrad;
- if (prevOutGrad) {
- ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+ update_gate_grad[i] = r_update_gate_grad;
+ reset_gate_grad[i] = r_reset_gate_grad;
+ if (prev_out_grad) {
+ ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
}
}
#endif
}
template <class OpStateGrad, typename T>
-inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value value,
- hl_gru_grad grad, int frameSize,
- int batchSize, activation_mode_t active_node) {
- for (int b = 0; b < batchSize; b++) {
- if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void backward_state_grad(OpStateGrad op_state_grad,
+ hl_gru_value value, hl_gru_grad grad,
+ int frame_size, int batch_size,
+ activation_mode_t active_node) {
+ for (int b = 0; b < batch_size; b++) {
+ if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_state_grad(
- opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
- grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+ op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+ grad.prev_out_grad, grad.output_grad, frame_size, active_node);
} else {
hl_naive_gru_backward_state_grad(
- opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
- grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+ op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+ grad.prev_out_grad, grad.output_grad, frame_size, active_node);
}
- value.gateValue += frameSize * 3;
- if (value.prevOutValue) {
- value.prevOutValue += frameSize;
+ value.gate_value += frame_size * 3;
+ if (value.prev_out_value) {
+ value.prev_out_value += frame_size;
}
- grad.gateGrad += frameSize * 3;
- grad.outputGrad += frameSize;
- if (grad.prevOutGrad) {
- grad.prevOutGrad += frameSize;
+ grad.gate_grad += frame_size * 3;
+ grad.output_grad += frame_size;
+ if (grad.prev_out_grad) {
+ grad.prev_out_grad += frame_size;
}
}
}
template <class OpResetGrad, typename T>
-inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value value,
- hl_gru_grad grad, int frameSize,
- int batchSize, activation_mode_t active_gate) {
- for (int b = 0; b < batchSize; b++) {
- if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void backward_reset_grad(OpResetGrad op_reset_grad,
+ hl_gru_value value, hl_gru_grad grad,
+ int frame_size, int batch_size,
+ activation_mode_t active_gate) {
+ for (int b = 0; b < batch_size; b++) {
+ if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_reset_grad(
- opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
- grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+ op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+ grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
} else {
hl_naive_gru_backward_reset_grad(
- opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
- grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+ op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+ grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
}
- value.gateValue += frameSize * 3;
- if (value.prevOutValue) {
- value.prevOutValue += frameSize;
+ value.gate_value += frame_size * 3;
+ if (value.prev_out_value) {
+ value.prev_out_value += frame_size;
}
- grad.gateGrad += frameSize * 3;
- grad.resetOutputGrad += frameSize;
- if (grad.prevOutGrad) {
- grad.prevOutGrad += frameSize;
+ grad.gate_grad += frame_size * 3;
+ grad.reset_output_grad += frame_size;
+ if (grad.prev_out_grad) {
+ grad.prev_out_grad += frame_size;
}
}
}
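Note (reading aid, not part of the patch): the forward_reset_output, forward_final_output, backward_state_grad and backward_reset_grad wrappers above all gate the AVX path on the same predicate: the functor must expose an AVX overload (its static avx flag), frame_size must be a multiple of 8, and T must be a 4-byte float, because one __m256 register holds exactly 8 packed floats. A minimal C++ sketch of that predicate, with a hypothetical helper name:

    // Illustration only; can_use_avx_path is not a function in the patch.
    template <typename T>
    inline bool can_use_avx_path(bool op_has_avx, int frame_size) {
      // !(frame_size & (8 - 1)) in the patch is equivalent to
      // frame_size % 8 == 0: the AVX loops consume 8 floats per iteration.
      return op_has_avx && (frame_size % 8 == 0) && (sizeof(T) == 4);
    }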
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h
index 6441c648b048422c110872a85aa8cb719f11a8d7..d2edcb7f258b387530799b967fc0fff61acc5b83 100644
--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -27,174 +27,174 @@ namespace math {
namespace detail {
/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
*/
-template <class OpResetOutput, bool isBatch, typename T>
-__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
- T *gateValue, T *resetOutputValue,
- T *prevOutputValue, int frameSize,
- int batchSize,
+template <class OpResetOutput, bool is_batch, typename T>
+__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
+ T *gate_value, T *reset_output_value,
+ T *prev_output_value, int frame_size,
+ int batch_size,
activation_mode_t active_gate) {
- const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
- if (frameIdx >= frameSize) return;
-
- int batchIdx = 0;
- if (isBatch) {
- batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
- if (batchIdx >= batchSize) return;
- gateValue += batchIdx * 3 * frameSize;
- resetOutputValue += batchIdx * frameSize;
+ const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (frame_idx >= frame_size) return;
+
+ int batch_idx = 0;
+ if (is_batch) {
+ batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+ if (batch_idx >= batch_size) return;
+ gate_value += batch_idx * 3 * frame_size;
+ reset_output_value += batch_idx * frame_size;
}
- T rPrevOut = 0;
- T rValueResetOutput;
- T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
- T rValueResetGate = gateValue[frameIdx + frameSize * 1];
+ T r_prev_out = 0;
+ T r_value_reset_output;
+ T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+ T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
- if (prevOutputValue) {
- if (isBatch) prevOutputValue += batchIdx * frameSize;
- rPrevOut = prevOutputValue[frameIdx];
+ if (prev_output_value) {
+ if (is_batch) prev_output_value += batch_idx * frame_size;
+ r_prev_out = prev_output_value[frame_idx];
}
- opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
- active_gate);
+ op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+ r_value_reset_output, active_gate);
- gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
- gateValue[frameIdx + frameSize * 1] = rValueResetGate;
- resetOutputValue[frameIdx] = rValueResetOutput;
+ gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
+ gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
+ reset_output_value[frame_idx] = r_value_reset_output;
}
/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
*/
-template <class OpFinalOutput, bool isBatch, typename T>
-__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
- T *gateValue, T *prevOutputValue,
- T *outputValue, int frameSize,
- int batchSize,
+template <class OpFinalOutput, bool is_batch, typename T>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
+ T *gate_value, T *prev_output_value,
+ T *output_value, int frame_size,
+ int batch_size,
activation_mode_t active_node) {
- const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
- if (frameIdx >= frameSize) return;
- int batchIdx = 0;
- if (isBatch) {
- batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
- if (batchIdx >= batchSize) return;
- gateValue += batchIdx * 3 * frameSize;
- outputValue += batchIdx * frameSize;
+ const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (frame_idx >= frame_size) return;
+ int batch_idx = 0;
+ if (is_batch) {
+ batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+ if (batch_idx >= batch_size) return;
+ gate_value += batch_idx * 3 * frame_size;
+ output_value += batch_idx * frame_size;
}
- T rOutput;
- T rPrevOut = 0;
- T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
- T rValueFrameState = gateValue[frameIdx + frameSize * 2];
+ T r_output;
+ T r_prev_out = 0;
+ T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+ T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
- if (prevOutputValue) {
- if (isBatch) prevOutputValue += batchIdx * frameSize;
- rPrevOut = prevOutputValue[frameIdx];
+ if (prev_output_value) {
+ if (is_batch) prev_output_value += batch_idx * frame_size;
+ r_prev_out = prev_output_value[frame_idx];
}
- opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
- active_node);
+ op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+ r_output, active_node);
- gateValue[frameIdx + frameSize * 2] = rValueFrameState;
- outputValue[frameIdx] = rOutput;
+ gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
+ output_value[frame_idx] = r_output;
}
/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
*/
-template <class OpStateGrad, bool isBatch, typename T>
-__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
- T *gateGrad, T *prevOutValue,
- T *prevOutGrad, T *outputGrad,
- int frameSize, int batchSize,
+template <class OpStateGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
+ T *gate_grad, T *prev_out_value,
+ T *prev_out_grad, T *output_grad,
+ int frame_size, int batch_size,
activation_mode_t active_node) {
- const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
- if (frameIdx >= frameSize) return;
- int batchIdx = 0;
- if (isBatch) {
- batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
- if (batchIdx >= batchSize) return;
- gateValue += batchIdx * 3 * frameSize;
- gateGrad += batchIdx * 3 * frameSize;
- outputGrad += batchIdx * frameSize;
+ const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (frame_idx >= frame_size) return;
+ int batch_idx = 0;
+ if (is_batch) {
+ batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+ if (batch_idx >= batch_size) return;
+ gate_value += batch_idx * 3 * frame_size;
+ gate_grad += batch_idx * 3 * frame_size;
+ output_grad += batch_idx * frame_size;
}
- T rUpdateGateGrad;
- T rFrameStateGrad;
- T rPrevOutValue = 0;
- T rPrevOutGrad = 0;
- T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
- T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
- T rOutGrad = outputGrad[frameIdx];
+ T r_update_gate_grad;
+ T r_frame_state_grad;
+ T r_prev_out_value = 0;
+ T r_prev_out_grad = 0;
+ T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+ T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
+ T r_out_grad = output_grad[frame_idx];
- if (prevOutValue && prevOutGrad) {
- if (isBatch) prevOutValue += batchIdx * frameSize;
- rPrevOutValue = prevOutValue[frameIdx];
+ if (prev_out_value && prev_out_grad) {
+ if (is_batch) prev_out_value += batch_idx * frame_size;
+ r_prev_out_value = prev_out_value[frame_idx];
- if (isBatch) prevOutGrad += batchIdx * frameSize;
- rPrevOutGrad = prevOutGrad[frameIdx];
+ if (is_batch) prev_out_grad += batch_idx * frame_size;
+ r_prev_out_grad = prev_out_grad[frame_idx];
}
- opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
- rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
- active_node);
+ op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+ r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+ r_out_grad, active_node);
- gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
- gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
- if (prevOutGrad) {
- prevOutGrad[frameIdx] = rPrevOutGrad;
+ gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+ gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
+ if (prev_out_grad) {
+ prev_out_grad[frame_idx] = r_prev_out_grad;
}
}
/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
*/
-template <class OpResetGrad, bool isBatch, typename T>
-__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
- T *gateGrad, T *prevOutValue,
- T *prevOutGrad, T *resetOutputGrad,
- int frameSize, int batchSize,
+template <class OpResetGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
+ T *gate_grad, T *prev_out_value,
+ T *prev_out_grad, T *reset_output_grad,
+ int frame_size, int batch_size,
activation_mode_t active_gate) {
- const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
- if (frameIdx >= frameSize) return;
- int batchIdx = 0;
- if (isBatch) {
- batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
- if (batchIdx >= batchSize) return;
- gateValue += batchIdx * 3 * frameSize;
- gateGrad += batchIdx * 3 * frameSize;
- resetOutputGrad += batchIdx * frameSize;
+ const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (frame_idx >= frame_size) return;
+ int batch_idx = 0;
+ if (is_batch) {
+ batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+ if (batch_idx >= batch_size) return;
+ gate_value += batch_idx * 3 * frame_size;
+ gate_grad += batch_idx * 3 * frame_size;
+ reset_output_grad += batch_idx * frame_size;
}
- T rResetGateGrad;
- T rPrevOutValue = 0;
- T rPrevOutGrad = 0;
- T rResetOutputGrad = 0;
- T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
- T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
- T rResetGateValue = gateValue[frameIdx + frameSize * 1];
-
- if (prevOutValue && prevOutGrad) {
- if (isBatch) prevOutValue += batchIdx * frameSize;
- if (isBatch) prevOutGrad += batchIdx * frameSize;
- rPrevOutValue = prevOutValue[frameIdx];
- rPrevOutGrad = prevOutGrad[frameIdx];
- rResetOutputGrad = resetOutputGrad[frameIdx];
+ T r_reset_gate_grad;
+ T r_prev_out_value = 0;
+ T r_prev_out_grad = 0;
+ T r_reset_output_grad = 0;
+ T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+ T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
+ T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
+
+ if (prev_out_value && prev_out_grad) {
+ if (is_batch) prev_out_value += batch_idx * frame_size;
+ if (is_batch) prev_out_grad += batch_idx * frame_size;
+ r_prev_out_value = prev_out_value[frame_idx];
+ r_prev_out_grad = prev_out_grad[frame_idx];
+ r_reset_output_grad = reset_output_grad[frame_idx];
}
- opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
- rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
- active_gate);
+ op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+ r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+ r_reset_output_grad, active_gate);
- gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
- gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
- if (prevOutGrad) {
- prevOutGrad[frameIdx] = rPrevOutGrad;
+ gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+ gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
+ if (prev_out_grad) {
+ prev_out_grad[frame_idx] = r_prev_out_grad;
}
}
} // namespace detail
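Note (reading aid, not part of the patch): each kernel above documents its launch geometry as threads(frame_per_block, batch_per_block) and grid(frame_blocks, batch_blocks); the frame index comes from the x dimension, the batch index from the y dimension, and is_batch selects whether the y dimension is used at all. The actual launch code is outside this hunk, so the host-side sketch below is only an assumption of how such dimensions could be chosen (the 1024-thread and 32x32 constants are guesses, and frame_size is assumed positive):

    #include <cuda_runtime.h>

    // Hypothetical helper; not part of the patch.
    void pick_gru_launch_dims(int frame_size, int batch_size, dim3 *threads,
                              dim3 *grid) {
      if (batch_size == 1) {
        // Single step: spend all threads on the frame dimension.
        int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
        *threads = dim3(frame_per_block, 1);
        *grid = dim3((frame_size + frame_per_block - 1) / frame_per_block, 1);
      } else {
        // Batched: 32x32 tiles over (frame, batch).
        *threads = dim3(32, 32);
        *grid = dim3((frame_size + 31) / 32, (batch_size + 31) / 32);
      }
    }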
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h
index 8a681d8d8bced72e1296f863489f6ccbc7913167..acd84be01db9ddaf06d165d8be353b253f324dd2 100644
--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -28,23 +28,25 @@ namespace forward {
template <typename T>
class gru_resetOutput {
public:
- HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
- T &valueResetOutput, activation_mode_t actGate) {
- valueUpdateGate = activation(valueUpdateGate, actGate);
- valueResetGate = activation(valueResetGate, actGate);
- valueResetOutput = prevOut * valueResetGate;
+ HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
+ T &prev_out, T &value_reset_output,
+ activation_mode_t act_gate) {
+ value_update_gate = activation(value_update_gate, act_gate);
+ value_reset_gate = activation(value_reset_gate, act_gate);
+ value_reset_output = prev_out * value_reset_gate;
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
- HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
- __m256 &prevOut, __m256 &valueResetOutput,
- activation_mode_t actGate) {
- valueUpdateGate = activation(valueUpdateGate, actGate);
- valueResetGate = activation(valueResetGate, actGate);
- valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
+ HOSTDEVICE void operator()(__m256 &value_update_gate,
+ __m256 &value_reset_gate, __m256 &prev_out,
+ __m256 &value_reset_output,
+ activation_mode_t act_gate) {
+ value_update_gate = activation(value_update_gate, act_gate);
+ value_reset_gate = activation(value_reset_gate, act_gate);
+ value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
}
#endif
#endif
@@ -53,24 +55,26 @@ class gru_resetOutput {
template <typename T>
class gru_finalOutput {
public:
- HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
- T &valueOutput, activation_mode_t actInput) {
- valueFrameState = activation(valueFrameState, actInput);
- valueOutput = prevOut - (valueUpdateGate * prevOut) +
- (valueUpdateGate * valueFrameState);
+ HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
+ T &prev_out, T &value_output,
+ activation_mode_t act_input) {
+ value_frame_state = activation(value_frame_state, act_input);
+ value_output = prev_out - (value_update_gate * prev_out) +
+ (value_update_gate * value_frame_state);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
- HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
- __m256 &prevOut, __m256 &valueOutput,
- activation_mode_t actInput) {
- valueFrameState = activation(valueFrameState, actInput);
- valueOutput = _mm256_add_ps(
- _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
- _mm256_mul_ps(valueUpdateGate, valueFrameState));
+ HOSTDEVICE void operator()(__m256 &value_update_gate,
+ __m256 &value_frame_state, __m256 &prev_out,
+ __m256 &value_output,
+ activation_mode_t act_input) {
+ value_frame_state = activation(value_frame_state, act_input);
+ value_output = _mm256_add_ps(
+ _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
+ _mm256_mul_ps(value_update_gate, value_frame_state));
}
#endif
#endif
@@ -82,34 +86,37 @@ namespace backward {
template <typename T>
class gru_stateGrad {
public:
- HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
- T &valueFrameState, T &gradFrameState,
- T &valuePrevOut, T &gradPrevOut, T &gradOutput,
- activation_mode_t actInput) {
- gradUpdateGate = (gradOutput * valueFrameState);
- gradUpdateGate -= (gradOutput * valuePrevOut);
- gradPrevOut -= (gradOutput * valueUpdateGate);
- gradPrevOut += gradOutput;
- gradFrameState =
- activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
+ HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+ T &value_frame_state, T &grad_frame_state,
+ T &value_prev_out, T &grad_prev_out,
+ T &grad_output, activation_mode_t act_input) {
+ grad_update_gate = (grad_output * value_frame_state);
+ grad_update_gate -= (grad_output * value_prev_out);
+ grad_prev_out -= (grad_output * value_update_gate);
+ grad_prev_out += grad_output;
+ grad_frame_state = activation(grad_output * value_update_gate,
+ value_frame_state, act_input);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
- HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
- __m256 &valueFrameState, __m256 &gradFrameState,
- __m256 &valuePrevOut, __m256 &gradPrevOut,
- __m256 &gradOutput, activation_mode_t actInput) {
- gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
- gradUpdateGate =
- _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
- gradPrevOut = _mm256_add_ps(
- _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
- gradOutput);
- gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
- valueFrameState, actInput);
+ HOSTDEVICE void operator()(__m256 &value_update_gate,
+ __m256 &grad_update_gate,
+ __m256 &value_frame_state,
+ __m256 &grad_frame_state, __m256 &value_prev_out,
+ __m256 &grad_prev_out, __m256 &grad_output,
+ activation_mode_t act_input) {
+ grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
+ grad_update_gate = _mm256_sub_ps(
+ grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
+ grad_prev_out = _mm256_add_ps(
+ _mm256_sub_ps(grad_prev_out,
+ _mm256_mul_ps(grad_output, value_update_gate)),
+ grad_output);
+ grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
+ value_frame_state, act_input);
}
#endif
#endif
@@ -118,30 +125,32 @@ class gru_stateGrad {
template <typename T>
class gru_resetGrad {
public:
- HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
- T &valueResetGate, T &gradResetGate,
- T &valuePrevOut, T &gradPrevOut,
- T &gradResetOutput, activation_mode_t actGate) {
- gradResetGate = (gradResetOutput * valuePrevOut);
- gradPrevOut += (gradResetOutput * valueResetGate);
- gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
- gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+ HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+ T &value_reset_gate, T &grad_reset_gate,
+ T &value_prev_out, T &grad_prev_out,
+ T &grad_reset_output, activation_mode_t act_gate) {
+ grad_reset_gate = (grad_reset_output * value_prev_out);
+ grad_prev_out += (grad_reset_output * value_reset_gate);
+ grad_update_gate =
+ activation(grad_update_gate, value_update_gate, act_gate);
+ grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
- HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
- __m256 &valueResetGate, __m256 &gradResetGate,
- __m256 &valuePrevOut, __m256 &gradPrevOut,
- __m256 &gradResetOutput,
- activation_mode_t actGate) {
- gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
- gradPrevOut = _mm256_add_ps(gradPrevOut,
- _mm256_mul_ps(gradResetOutput, valueResetGate));
- gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
- gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+ HOSTDEVICE void operator()(__m256 &value_update_gate,
+ __m256 &grad_update_gate, __m256 &value_reset_gate,
+ __m256 &grad_reset_gate, __m256 &value_prev_out,
+ __m256 &grad_prev_out, __m256 &grad_reset_output,
+ activation_mode_t act_gate) {
+ grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
+ grad_prev_out = _mm256_add_ps(
+ grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
+ grad_update_gate =
+ activation(grad_update_gate, value_update_gate, act_gate);
+ grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
}
#endif
#endif
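Note (reading aid, not part of the patch): the forward functors above implement the two halves of a GRU step element-wise. gru_resetOutput activates the update and reset gates and forms reset_output = prev_out * reset_gate; gru_finalOutput activates the candidate state and blends output = prev_out - u * prev_out + u * state, i.e. (1 - u) * prev_out + u * state. A scalar C++ restatement, using sigmoid and tanh as stand-ins for the activation(x, act_gate) / activation(x, act_input) calls (the real activation is chosen at run time):

    #include <cmath>

    inline float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    // Phase 1: what gru_resetOutput does for one element.
    inline float gru_reset_output(float &update_gate, float &reset_gate,
                                  float prev_out) {
      update_gate = sigmoid(update_gate);
      reset_gate = sigmoid(reset_gate);
      return prev_out * reset_gate;  // value_reset_output
    }

    // Phase 2: what gru_finalOutput does for one element.
    inline float gru_final_output(float update_gate, float &frame_state,
                                  float prev_out) {
      frame_state = std::tanh(frame_state);
      return prev_out - update_gate * prev_out + update_gate * frame_state;
    }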
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
index 125af449d3f700e24be5e4b7615c3b0e03fd4e5b..ae4e47b014a9cd1f656dd9332086aa4d1b7cbb52 100644
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
@@ -21,29 +21,29 @@ namespace math {
template <typename T>
struct GRUUnitFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context,
- hl_gru_value value, int frameSize, int batchSize,
+ hl_gru_value value, int frame_size, int batch_size,
activation_mode_t active_node,
activation_mode_t active_gate) {
#ifndef __NVCC__
- if (value.prevOutValue) {
+ if (value.prev_out_value) {
math::gemm(
- context, false, false, batchSize, frameSize * 2, frameSize, 1,
- value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
- value.gateValue, frameSize * 3);
+ context, false, false, batch_size, frame_size * 2, frame_size, 1,
+ value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+ 1, value.gate_value, frame_size * 3);
}
detail::forward_reset_output(detail::forward::gru_resetOutput(), value,
- frameSize, batchSize, active_gate);
+ frame_size, batch_size, active_gate);
- if (value.prevOutValue) {
+ if (value.prev_out_value) {
math::gemm