提交 c339e1b7 编写于 作者: C chengduoZH

remove conflict

......@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
......@@ -42,7 +42,7 @@ before_install:
script:
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
- |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
......
......@@ -133,6 +133,8 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
include(external/cares)
include(external/grpc)
include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration
......
......@@ -29,7 +29,7 @@ RUN apt-get update && \
automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \
net-tools libtool && \
apt-get clean -y
# Install Go and glide
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: c-ares is needed when linking with grpc.
SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
ExternalProject_Add(
extern_cares
GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
GIT_TAG "cares-1_13_0"
PREFIX ${CARES_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
BUILD_IN_SOURCE 1
BUILD_COMMAND make
INSTALL_COMMAND make install
)
ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
"${CARES_INSTALL_DIR}/lib/libcares.a")
include_directories(${CARES_INCLUDE_DIR})
ADD_DEPENDENCIES(cares extern_cares)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
IF(APPLE)
SET(BUILD_CMD make -n | sed "s/-Werror//g" | sh)
ELSE()
SET(BUILD_CMD make)
ENDIF()
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
GIT_TAG "v1.7.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
# NOTE(yuyang18):
# Disable -Werror, otherwise the compile will fail in MacOS.
# It seems that we cannot configure that by make command.
# Just dry run make command and remove `-Werror`, then use a shell to run make commands
BUILD_COMMAND ${BUILD_CMD} HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
)
# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgpr.a")
ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
include_directories(${GRPC_INCLUDE_DIR})
ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
......@@ -15,7 +15,18 @@
INCLUDE(ExternalProject)
# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
FIND_PACKAGE(Protobuf QUIET)
SET(PROTOBUF_FOUND "OFF")
macro(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME} CACHE)
UNSET(${VAR_NAME})
endmacro()
UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(PROTOBUF_FOUND)
UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
UNSET_VAR(PROTOBUF_LITE_LIBRARY)
UNSET_VAR(PROTOBUF_LIBRARY)
UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined.
function(protobuf_generate_python SRCS)
......@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
# FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
# make `protobuf_generate_cpp` happy.
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
FOREACH(dep ${protobuf_DEPS})
ADD_DEPENDENCIES(protobuf ${dep})
ADD_DEPENDENCIES(protobuf_lite ${dep})
......@@ -128,11 +138,11 @@ endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
SET_PROTOBUF_VERSION()
......
......@@ -50,6 +50,8 @@ ExternalProject_Add(
)
LIST(APPEND external_project_dependencies zlib)
ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
......
......@@ -467,3 +467,50 @@ function(py_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction()
# grpc_library generate grpc code using grpc_cpp_plugin and protoc
# then build the generated protobuf code and grpc code with your
# implementation source codes together. Use SRCS argument for your
# implementation source files and PROTO argument for your .proto
# files.
#
# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
function(grpc_library TARGET_NAME)
set(oneValueArgs PROTO)
set(multiValueArgs SRCS DEPS)
set(options "")
cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
message(STATUS "generating grpc ${grpc_library_PROTO}")
get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
add_custom_command(
OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
--plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
# FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
# as compiler warnings instead of error. Should try remove the warnings also.
set_source_files_properties(
${grpc_grpc_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
set_source_files_properties(
${grpc_library_SRCS}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction()
......@@ -54,7 +54,7 @@ img_conv
.. _api_v2.layer_context_projection:
context_projection
context_projection
------------------
.. autoclass:: paddle.v2.layer.context_projection
:noindex:
......@@ -70,7 +70,7 @@ Image Pooling Layer
img_pool
--------
.. autoclass:: paddle.v2.layer.img_pool
:noindex:
:noindex:
spp
---
......@@ -104,7 +104,7 @@ sum_to_one_norm
---------------
.. autoclass:: paddle.v2.layer.sum_to_one_norm
:noindex:
cross_channel_norm
------------------
.. autoclass:: paddle.v2.layer.cross_channel_norm
......@@ -114,7 +114,7 @@ row_l2_norm
-----------
.. autoclass:: paddle.v2.layer.row_l2_norm
:noindex:
Recurrent Layers
================
......@@ -415,6 +415,13 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex
:noindex:
Factorization Machine Layer
============================
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Slicing and Joining Layers
==========================
......
......@@ -2,106 +2,70 @@
## Abstract
PaddlePaddle v0.10.0 uses the "trainer-parameter server"
architecture. We run multiple replicated instances of trainers (runs
the same code written by the user) and parameter servers for
distributed training. This architecture served us well, but has some
limitations:
PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations:
1. Need to write special code to handle tasks which should only be run
by a single trainer. E.g., initializing model and saving model.
1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
2. Model parallelism is hard: need to write if-else branches conditioned
on the trainer ID to partition model onto each trainer, and manually
write the inter-model-shard communication code.
2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
3. The user can not directly specify the parameter update rule: need
to modify the parameter server C++ code and compile a new
binary. This adds complication for researchers: A lot of extra
effort is required. Besides, the training job submission program
may not allow running arbitrary binaries.
3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
This design doc discusses PaddlePaddle's new distributed training
architecture that addresses the above limitations.
This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
## Analysis
We will assume the user writes the trainer program by Python, the same
analysis holds if the trainer program is written in C++.
The assumption is that the user writes the trainer program in either Python or C++.
### Limitation 1
If we look at the Python code that the user writes, there are two
kinds of functionalities:
There are two basic functionalities in the trainer program:
- The training logic such as load / save model and print log.
- The neural network definition such as the definition of the data
layer, the fully connected layer, the cost function and the
1. The training logic such as loading / saving the model and printing out the logs.
2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
optimizer.
When we training with PaddlePaddle v0.10.0 distributedly, multiple
replicated Python instances are running on different nodes: both the
training logic and the neural network computation is replicated.
When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the
training logic as well as the neural network computation logic, is replicated.
The tasks that should only run once all belong to the training logic,
if we only replicate the neural network computation, but do **not**
replicate the training logic, the limitation could be solved.
The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
replicate the training logic, the limitation mentioned above can be avoided.
### Limitation 2
Model parallelism means running a single model on multiple nodes by
partitioning the model onto different nodes and managing the
inter-model-shard communications.
Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the
inter-model-shard communication between nodes.
PaddlePaddle should be able to modify the nerual network computation
definition to support model parallelism automatically. However, the
computation is only specified in Python code, and PaddlePaddle can not
modify Python code.
PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
Just like compiler uses a intermediate representation (IR) so that
programmer does not need to manually optimize their code in most of
the cases - the compiler will optimize the IR:
Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
<img src="src/compiler.png"/>
We can have our own IR too: PaddlePaddle can support model parallel by
converting the IR so the user no longer need to manually do it in
Python:
PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
<img src="src/paddle-compile.png"/>
The IR for PaddlePaddle after refactor is called `Block`, it specifies
the computation dependency graph and the variables used in the
computation.
The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
### Limitation 3
The user can not directly specify the parameter update rule for the
parameter server because the parameter server does not use the same
computation definition as the trainer. Instead, the update rule is
baked in the parameter server. The user can not specify the update
rule in the same way of specifying the trainer computation.
The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
This could be fixed by making the parameter server run the same
computation definition as the trainer. For a detailed explanation,
please
see
This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
## Distributed Training Architecture
The new distributed training architecture can address the above
limitations. Below is the illustration:
The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
<img src="src/distributed_architecture.png"/>
The architecture includes major components: *PaddlePaddle Python*,
*PaddlePaddle converter* and *PaddlePaddle runtime*:
The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
### PaddlePaddle Python
PaddlePaddle Python is the Python library that user's Python trainer
invoke to build the neural network topology, start training, etc.
PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
```Python
paddle.init()
......@@ -117,102 +81,60 @@ for i in range(1000):
print cost_val
```
The code above is a typical Python trainer code, the neural network
topology is built using helper functions such as
`paddle.layer.fc`. The training is done by calling `session.eval`
iteratively.
The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
#### session.eval
As shown in the graph, `session.eval` sends the IR and the evaluation
inputs/targets to the PaddlePaddle cluster for evaluation. The
targets can be any variable in the computation graph. When the target
is the `optimizer` variable, the neural network will be optimized
once. When the target is the `cost` variable, `session.eval` returns
the cost value.
As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
The Python `session` is a wrapper of the C++ `Session` class. For more
information about `Session`, please
see [Design Doc: Session](./session.md).
The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
### PaddlePaddle Converter
PaddlePaddle converter automatically converts the IR in the request
(IR and evaluation inputs/targets) from PaddlePaddle Python to new
partitioned IRs and dispatch the new IRs and evaluation inputs/targets
to different PaddlePaddle runtimes. Below are the steps:
The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
fetches the eval targets to the IR.
1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
the boundary. The runtime does not need to run the OP that is not
dependent by the `fetch` OP.
2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
1. Optimizes the computation graph.
3. Optimize the computation graph.
1. Place the OPs in the graph onto different devices on different
PaddlePaddle runtime according to a placement algorithm and device
constraint specified by the user.
4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
1. Partition the graph according to runtime boundaries and add `send` /
`recv` OP pair on the runtime boundaries.
5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
results back to the converter, the convert reports the evaluation
results back to the PaddlePaddle Python.
The output IRs will be cached to optimize the conversion latency.
#### Placement Algorithm
Our first implementation will only support "trainer-parameter server"
placement: the parameters, initializers, and optimizers are placed on
the PaddlePaddle runtimes with the parameter server role. And
everything else will be placed on the PaddlePaddle runtimes with the
trainer role. This has the same functionality of our
"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
is more general and flexible.
Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
In the future, we will implement the general placement algorithm,
which makes placements according to the input IR, and a model of
device computation time and device communication time. Model
parallelism requires the general placement algorithm.
In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
### PaddlePaddle Runtime
The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
runs the IR. The runtime does not need to do OP placement since it's
already done by the converter.
The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
### Local Training Architecture
The local training architecture will be the same as the distributed
training architecture, the differences are everything runs locally,
and there is just one PaddlePaddle runtime:
The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
<img src="src/local_architecture.png"/>
### Training Data
In PaddlePaddle v0.10.0, training data is typically read
with [data reader](../reader/README.md) from Python. This approach is
no longer efficient when training distributedly since the Python
process no longer runs on the same node with the trainer processes,
the Python reader will need to read from the distributed filesystem
(assuming it has the access) and send to the trainers, doubling the
network traffic.
When doing distributed training, the user can still use Python data
reader: the training data are sent with `session.eval`. However should
be used for debugging purpose only. The users are encouraged to use
the read data OPs.
In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
## References:
......
从源码编译PaddlePaddle
从源码编译
======================
.. _build_step:
......@@ -7,8 +7,11 @@
----------------
PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
我们推荐您使用PaddlePaddle编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境
我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
编译PaddlePaddle,需要执行:
.. code-block:: bash
......@@ -22,7 +25,6 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
make
编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
......@@ -31,7 +33,33 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
pip install python/dist/*.whl
.. _build_step:
.. _run_test:
执行单元测试
----------------
如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法:
使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。
开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
如果不使用Docker,可以执行ctest命令即可:
.. code-block:: bash
mkdir build
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
make
ctest
# 指定执行其中一个单元测试 test_mul_op
ctest -R test_mul_op
.. _compile_deps:
编译依赖
----------------
......
Build PaddlePaddle from Sources
Build from Sources
==========================
.. _build_step:
......@@ -9,14 +9,18 @@ How To Build
PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
tools. We recommend you to use our pre-built Docker image to run the build
to avoid installing dependencies by yourself. We have several build environment
Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_.
Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
If you choose not to use Docker image for your build, you need to install the
below `Compile Dependencies`_ before run the build.
Then run:
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# run the following command to build CPU-Only binaries if you are using docker
# run the following command to build a CPU-Only binaries if you are using docker
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
# else run these commands
mkdir build
......@@ -32,7 +36,35 @@ machine or copy it to the target machine.
pip install python/dist/*.whl
.. _build_step:
.. _run_test:
Run Tests
----------------
If you wish to run the tests, you may follow the below steps:
When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build.
Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
If you don't use Docker, just run ctest will start the tests:
.. code-block:: bash
mkdir build
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
make
ctest
# run a single test like test_mul_op
ctest -R test_mul_op
.. _compile_deps:
Compile Dependencies
----------------
......
使用Docker安装运行PaddlePaddle
使用Docker安装运行
================================
使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
......
PaddlePaddle in Docker Containers
Run in Docker Containers
=================================
Run PaddlePaddle in Docker container so that you don't need to care about
......
使用pip安装PaddlePaddle
使用pip安装
================================
PaddlePaddle可以使用常用的Python包管理工具
......
Install PaddlePaddle Using pip
Install Using pip
================================
You can use current widely used Python package management
......
......@@ -19,7 +19,6 @@
.. toctree::
:maxdepth: 1
dev/build_cn.rst
dev/write_docs_cn.rst
模型配置
......
......@@ -18,7 +18,6 @@ Development
.. toctree::
:maxdepth: 1
dev/build_en.rst
dev/new_layer_en.rst
dev/contribute_to_paddle_en.md
......
......@@ -13,6 +13,8 @@
limitations under the License. */
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
......@@ -27,11 +29,11 @@
namespace paddle {
namespace framework {
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
std::ostream &operator<<(std::ostream &os, const LoD &lod) {
os << "{";
for (auto& v : lod) {
for (auto &v : lod) {
os << "{";
for (auto& i : v) {
for (auto &i : v) {
os << i << ",";
}
os << "}";
......@@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
return os;
}
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
LoD new_lod;
new_lod.reserve(level_end - level_begin);
for (size_t i = level_begin; i < level_end; i++) {
......@@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
return new_lod;
}
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
size_t elem_end) {
PADDLE_ENFORCE_LT(level, in.size());
PADDLE_ENFORCE_LT(elem_end, in[level].size());
......@@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
res[0].assign(in[level].begin() + elem_begin,
in[level].begin() + elem_end + 1);
for (size_t lvl = 1; lvl < res.size(); lvl++) {
const auto& in_level = in[level + lvl];
const auto& above_level = res[lvl - 1];
auto& out_level = res[lvl];
const auto &in_level = in[level + lvl];
const auto &above_level = res[lvl - 1];
auto &out_level = res[lvl];
out_level.assign(in_level.begin() + above_level.front(),
in_level.begin() + above_level.back() + 1);
}
......@@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
// to make the first offset equals 0, all the elements minus the first
// element
size_t front = res[lvl].front();
for (auto& ele : res[lvl]) {
for (auto &ele : res[lvl]) {
ele -= front;
}
}
return res;
}
LoD ToAbsOffset(const LoD& in) {
LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in;
LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) {
for (auto& ele : result[level]) {
for (auto &ele : result[level]) {
ele = result[level + 1][ele];
}
}
return result;
}
bool operator==(const LoD& a, const LoD& b) {
bool operator==(const LoD &a, const LoD &b) {
if (a.size() != b.size()) {
return false;
}
for (size_t i = 0; i < a.size(); i++) {
const auto& a_level = a[i];
const auto& b_level = b[i];
const auto &a_level = a[i];
const auto &b_level = b[i];
if (a_level.size() != b_level.size()) {
return false;
}
......@@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
}
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
size_t end_idx, size_t start_level) {
LoD sub_lod;
......@@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
return LoDAndOffset{sub_lod, {start_idx, end_idx}};
}
void AppendLoD(LoD* lod, const LoD& lod_length) {
void AppendLoD(LoD *lod, const LoD &lod_length) {
PADDLE_ENFORCE(
lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
......@@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) {
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
}
for (size_t i = 0; i < lod->size(); ++i) {
auto& level = (*lod)[i];
auto &level = (*lod)[i];
for (size_t len : lod_length[i]) {
level.push_back(level.back() + len);
}
}
}
void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
const platform::DeviceContext &dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char *>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
framework::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto *pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto *data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto &gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::GPUPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char *>(data_ptr),
static_cast<std::streamsize>(size));
}
}
{ // the 4th field, lod information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
auto lod = tensor.lod();
uint64_t size = lod.size();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
for (auto &each : lod) {
size = each.size() * sizeof(framework::LoD::value_type::value_type);
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
os.write(reinterpret_cast<const char *>(each.data()),
static_cast<std::streamsize>(size));
}
}
}
void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
framework::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void *buf;
platform::Place cpu = platform::CPUPlace();
switch (desc.data_type()) {
case framework::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case framework::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case framework::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case framework::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char *>(buf), tensor->memory_size());
}
{ // read lod
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
is.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
lod[i] = tmp;
}
}
}
} // namespace framework
} // namespace paddle
......@@ -189,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
void AppendLoD(LoD* lod, const LoD& lod_length);
/*
* Serialize/Desiralize LoDTensor to std::ostream
* You can pass ofstream or ostringstream to serilize to file
* or to a in memory string. GPU tensor will be copied to CPU.
*/
void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
const platform::DeviceContext& dev_ctx);
void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "FactorizationMachineLayer.h"
#include <algorithm>
#include <vector>
#include "paddle/math/SparseMatrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
bool FactorizationMachineLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
factorSize_ = config_.factor_size();
/* initialize the latentVectors_ */
CHECK_EQ(inputLayers_.size(), 1UL);
size_t inputSize = inputLayers_[0]->getSize();
CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
latentVectors_ = std::unique_ptr<Weight>(
new Weight(inputSize, factorSize_, parameters_[0]));
return true;
}
void FactorizationMachineLayer::forward(PassType passType) {
Layer::forward(passType);
const MatrixPtr& inputV = getInputValue(0);
size_t batchSize = inputV->getHeight();
size_t outputSize = getSize();
size_t inputSize = inputLayers_[0]->getSize();
reserveOutput(batchSize, outputSize);
MatrixPtr outV = getOutputValue();
Matrix::resizeOrCreate(
latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(
inputMulFactor_, batchSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
inputMulFactor_->mul(*inputV, *latentVectors_->getW());
inputMulFactor_->square2(*tmpOut_);
outV->sumRows(*tmpOut_, 0.5, 0);
if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
Matrix::resizeOrCreateSparseMatrix(inputSquare_,
inputV->getHeight(),
inputV->getWidth(),
inputV->getElementCnt(),
inputV->getValueType());
inputSquare_->copyFrom(*inputV);
(dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
} else {
Matrix::resizeOrCreate(
inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
inputV->square2(*inputSquare_);
}
latentVectors_->getW()->square2(*latentVectorsSquare_);
tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
outV->sumRows(*tmpOut_, -0.5, 1.0);
/* activation */ {
REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
forwardActivation();
}
}
void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
/* Do derivation */ { backwardActivation(); }
const MatrixPtr& inputV = getInputValue(0);
const MatrixPtr& oGrad = getOutputGrad();
Matrix::resizeOrCreate(
tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
latentVectors_->getW()->getHeight(),
1,
false,
useGpu_);
/* Calculate the gradients of the latentVectors_ matrix */
if (latentVectors_->getWGrad()) {
if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
Matrix::resizeOrCreateSparseMatrix(tmpInput_,
inputV->getHeight(),
inputV->getWidth(),
inputV->getElementCnt());
CpuSparseMatrix* sparseInputV =
dynamic_cast<CpuSparseMatrix*>(inputV.get());
CpuSparseMatrix* sparseInputSquare =
dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
CpuSparseMatrix* sparseTmpInput =
dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
sparseTmpInput->copyFrom(*sparseInputV);
sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
latentVectors_->getWGrad()->mul(
*sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
negOnes_->zeroMem();
negOnes_->add(-1);
tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
} else {
Matrix::resizeOrCreate(
tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
tmpInput_->rowScale(0, *inputV, *oGrad);
latentVectors_->getWGrad()->mul(
*tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
tmpInput_->rowScale(0, *inputSquare_, *oGrad);
tmpSum_->sumCols(*tmpInput_, -1, 0);
}
latentVectors_->getWGrad()->addRowScale(
0, *latentVectors_->getW(), *tmpSumTrans);
/* Increasing the number of gradient */
latentVectors_->getParameterPtr()->incUpdate(callback);
}
/* Calculate the input layers gradient */
MatrixPtr inGrad = getInputGrad(0);
if (inGrad != NULL) {
inGrad->mul(
*inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
inGrad->addColScale(0, *inputV, *tmpSum_);
inGrad->rowScale(0, *inGrad, *oGrad);
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/ThreadLocal.h"
namespace paddle {
/**
* @brief The Factorization Machine models pairwise (order-2) feature
* interactions as inner product of the learned latent vectors corresponding
* to each input feature.
*
* The Factorization Machine can effectively capture feature interactions
* especially when the input is sparse. While in principle FM can model higher
* order feature interaction, in practice usually only order-2 feature
* interactions are considered. The Factorization Machine Layer here only
* computes the order-2 interations with the formula:
*
* \f[
* y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
* \f]
*
* The detailed calculation for forward and backward can be found at this paper:
*
* Factorization machines.
*
* The config file api is factorization_machine.
*/
class FactorizationMachineLayer : public Layer {
protected:
// The latent vectors, shape: (size, factorSize_)
// Each row of the latentVectors_ matrix is the latent vector
// corresponding to one input feature dimension
std::unique_ptr<Weight> latentVectors_;
// The hyperparameter that defines the dimensionality of the factorization
size_t factorSize_;
private:
// Store the square values of the letent vectors matrix
MatrixPtr latentVectorsSquare_;
// Store the square values of input matrix
MatrixPtr inputSquare_;
// The result of input matrix * latent vector matrix that will be used in
// both forward and backward step
MatrixPtr inputMulFactor_;
// Store temporary calculation result
MatrixPtr tmpOut_;
MatrixPtr tmpSum_;
MatrixPtr tmpInput_;
// Negative identity matrix
MatrixPtr negOnes_;
public:
explicit FactorizationMachineLayer(const LayerConfig& config)
: Layer(config) {}
~FactorizationMachineLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
} // namespace paddle
......@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
batchSize,
codeLength_,
/* trans */ false,
useGpu(deviceId_));
false);
Matrix::resizeOrCreate(preOutput_.grad,
batchSize,
codeLength_,
/* trans */ false,
useGpu(deviceId_));
false);
IVectorPtr label = getInput(*getLabelLayer()).ids;
preOutput_.value->zeroMem();
if (useGpu_) {
Matrix::resizeOrCreate(cpuOutput_,
output_.value->getHeight(),
output_.value->getWidth(),
/* trans */ false,
false);
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
cpuLabel_->copyFrom(*label);
cpuOutput_->copyFrom(*output_.value);
} else {
cpuOutput_ = output_.value;
cpuLabel_ = label;
}
/* add the bias-vector */
if (biases_.get() != NULL) {
preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
if (useGpu_) {
Matrix::resizeOrCreate(cpuBias_,
1,
numClasses_ - 1,
/* trans */ false,
false);
cpuBias_->copyFrom(*biases_->getW());
} else {
cpuBias_ = biases_->getW();
}
preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
}
for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
MatrixPtr input = getInputValue(i);
if (useGpu_) {
Matrix::resizeOrCreate(cpuInput_,
input->getHeight(),
input->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeight_,
weights_[i]->getW()->getHeight(),
weights_[i]->getW()->getWidth(),
/* trans */ false,
false);
cpuInput_->copyFrom(*input);
cpuWeight_->copyFrom(*weights_[i]->getW());
} else {
cpuInput_ = input;
cpuWeight_ = weights_[i]->getW();
}
preOutput_.value->mulByBitCode(
numClasses_, *label, *weights_[i]->getW(), *input);
numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
}
// keep consistent with the clipping in the following softrelu
preOutput_.value->clip(-40.0, 40.0);
preOutput_.value->sumByBitCode(numClasses_,
*label,
*output_.value,
*cpuLabel_,
*cpuOutput_,
-1); // scaleSum
preOutput_.value->softrelu(*preOutput_.value);
MatrixPtr sum =
Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
preOutput_.value->rowSum(*sum);
output_.value->add(*sum);
cpuOutput_->add(*sum);
if (useGpu_) {
output_.value->copyFrom(*cpuOutput_);
} else {
output_.value = cpuOutput_;
}
}
void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
IVectorPtr label = getInput(*getLabelLayer()).ids;
if (useGpu_) {
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
cpuLabel_->copyFrom(*label);
} else {
cpuLabel_ = label;
}
preOutput_.grad->one();
preOutput_.grad->softreluDerivative(*preOutput_.value);
preOutput_.grad->subByBitCode(numClasses_, *label);
preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
if (biases_ && biases_->getWGrad()) {
preOutput_.grad->addByBitCodeBackward(
numClasses_, *label, *biases_->getWGrad());
MatrixPtr biases_grad = biases_->getWGrad();
if (useGpu_) {
Matrix::resizeOrCreate(cpuBias_,
1,
numClasses_ - 1,
/* trans */ false,
false);
cpuBias_->copyFrom(*biases_grad);
} else {
cpuBias_ = biases_grad;
}
preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
if (useGpu_) {
biases_grad->copyFrom(*cpuBias_);
} else {
biases_grad = cpuBias_;
}
/* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback);
}
......@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the W-gradient for the current layer */
MatrixPtr input = getInputValue(i);
if (weights_[i]->getWGrad()) {
MatrixPtr weights_grad = weights_[i]->getWGrad();
if (useGpu_) {
Matrix::resizeOrCreate(cpuInput_,
input->getHeight(),
input->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeightGrad_,
weights_grad->getHeight(),
weights_grad->getWidth(),
/* trans */ false,
false);
cpuInput_->copyFrom(*input);
cpuWeightGrad_->copyFrom(*weights_grad);
} else {
cpuInput_ = input;
cpuWeightGrad_ = weights_grad;
}
preOutput_.grad->mulByBitCodeBackwardWeight(
numClasses_, *label, *weights_[i]->getWGrad(), *input);
numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
if (useGpu_) {
weights_grad->copyFrom(*cpuWeightGrad_);
} else {
weights_grad = cpuWeightGrad_;
}
/* Increasing the number of gradient */
weights_[i]->getParameterPtr()->incUpdate(callback);
}
......@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the input layers error */
MatrixPtr inputGrad = getInputGrad(i);
if (inputGrad) {
if (useGpu_) {
Matrix::resizeOrCreate(cpuInputGrad_,
inputGrad->getHeight(),
inputGrad->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeight_,
weights_[i]->getW()->getHeight(),
weights_[i]->getW()->getWidth(),
/* trans */ false,
false);
cpuInputGrad_->copyFrom(*inputGrad);
cpuWeight_->copyFrom(*weights_[i]->getW());
} else {
cpuInputGrad_ = inputGrad;
cpuWeight_ = weights_[i]->getW();
}
preOutput_.grad->mulByBitCodeBackwardError(
numClasses_, *label, *weights_[i]->getW(), *inputGrad);
numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
if (useGpu_) {
inputGrad->copyFrom(*cpuInputGrad_);
} else {
inputGrad = cpuInputGrad_;
}
}
}
}
......
......@@ -80,6 +80,15 @@ protected:
int codeLength_;
/// temporary result of output_
Argument preOutput_;
/// The temporary variables in CPU memory.
MatrixPtr cpuWeight_;
MatrixPtr cpuWeightGrad_;
MatrixPtr cpuInput_;
MatrixPtr cpuInputGrad_;
MatrixPtr cpuBias_;
MatrixPtr cpuOutput_;
IVectorPtr cpuLabel_;
};
} // namespace paddle
......@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
// Not support GPU now
testLayerGrad(config,
"hsigmoid",
100,
/* trans */ false, /* useGpu */
false);
for (auto useGpu : {false, true}) {
testLayerGrad(config,
"hsigmoid",
100,
/* trans */ false,
/* useGpu */ useGpu);
}
}
TEST(Layer, multi_cross) {
......@@ -2464,6 +2465,25 @@ TEST(Layer, L2DistanceLayer) {
}
}
void testFactorizationMachineLayer(InputType type, bool useGpu) {
const int FACTOR_SIZE = 10;
TestConfig config;
config.layerConfig.set_type("factorization_machine");
config.layerConfig.set_factor_size(FACTOR_SIZE);
config.layerConfig.set_size(1);
config.biasSize = 0;
config.inputDefs.push_back({type, "layer_0", 128, 1280});
config.layerConfig.add_inputs();
testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
}
TEST(Layer, FactorizationMachineLayer) {
for (auto useGpu : {false, true}) {
testFactorizationMachineLayer(INPUT_DATA, useGpu);
}
testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
......
......@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
os << ";";
}
void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
CHECK(getFormat() != SPARSE_CSC) << "Not supported";
CHECK_EQ(height_, b.getHeight());
CHECK_EQ(width_, b.getWidth());
real* A = getValue();
real* B = b.getValue();
if (b.getValueType() == FLOAT_VALUE) {
for (size_t i = 0; i < height_; i++) {
size_t start = getRowStartIdx(i);
size_t end = getRowStartIdx(i + 1);
CHECK_EQ(start, b.getRowStartIdx(i));
CHECK_EQ(end, b.getRowStartIdx(i + 1));
for (size_t j = start; j < end; j++) {
A[j] = B[j] * c.getElement(i, cCol);
}
}
} else if (b.getValueType() == NO_VALUE) {
for (size_t i = 0; i < height_; i++) {
size_t start = getRowStartIdx(i);
size_t end = getRowStartIdx(i + 1);
CHECK_EQ(start, b.getRowStartIdx(i));
CHECK_EQ(end, b.getRowStartIdx(i + 1));
for (size_t j = start; j < end; j++) {
A[j] = c.getElement(i, cCol);
}
}
}
}
void CpuSparseMatrix::randomizeUniform() {
CHECK_LE(elementCnt_, height_ * width_);
if (valueType_ == FLOAT_VALUE) {
......
......@@ -239,6 +239,15 @@ public:
const unsigned int* cols,
const real* values);
/**
* @brief this_row = b_row * c_row[cCol]
*
* @param[in] cCol the column of matrix c used to scale each row of b
* @param[in] b CpuSparseMatrix
* @param[in] c Matrix
*/
void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
void randomizeUniform();
void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
......
......@@ -205,8 +205,24 @@ set(DEPS_OPS
tensor_array_read_write_op
gru_op
adagrad_op
sgd_op)
sgd_op
save_op
load_op
send_op
recv_op)
add_subdirectory(detail)
op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
send_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
recv_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy)
......@@ -235,6 +251,10 @@ op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
......@@ -242,6 +262,8 @@ endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
......@@ -251,3 +273,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
......@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
const auto x_dims = ctx->GetInputDim("X");
const TensorFormat tensor_format =
StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions.");
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"Input X must have 3 to 5 dimensions.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
......@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5");
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5");
const int N = x_dims[0];
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]
......@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5");
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5");
const int N = x_dims[0];
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]
......
......@@ -29,14 +29,21 @@ void ExtractNCWHD(const framework::DDim &dims,
const TensorFormat &tensor_format, int *N, int *C, int *H,
int *W, int *D) {
*N = dims[0];
*C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
*H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
*W = dims.size() > 3
? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
: 1;
*D = dims.size() > 4
? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
: 1;
if (dims.size() == 2) {
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
*C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
*H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
*W = dims.size() > 3
? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
: 1;
*D = dims.size() > 4
? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
: 1;
}
}
template <typename T>
......@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
// NCHW [batch_size, in_channels, in_height, in_width]
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5");
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5");
int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
......@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T>
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5");
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5");
int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
......
......@@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
#if CUDNN_VERSION_MIN(7, 0, 0)
#if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
......@@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
#if CUDNN_VERSION_MIN(7, 0, 0)
#if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
......
grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "send_recv_impl.h"
namespace paddle {
namespace operators {
namespace detail {
Status SendRecvServerImpl::SendVariable(ServerContext *context,
const VariableMessage *in_var,
VariableMessage *out_var) {
framework::LoDTensor t;
// TODO(typhoonzero): desirealize in_tensor and run pserver network.
std::istringstream iss(in_var->serialized());
framework::DeserializeFromStream(iss, &t);
lodtensor_queue_.Push(std::move(t));
// Block util the sub graph is done.
t = lodtensor_return_queue_.Pop();
std::ostringstream oss;
// FIXME(typhoonzero): get context from op.
framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
std::string *varname = out_var->mutable_varname();
*varname = in_var->varname();
std::string *serialized = out_var->mutable_serialized();
*serialized = oss.str();
return Status::OK;
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "send_recv_impl.h"
namespace paddle {
namespace operators {
namespace detail {
bool RPCClient::SendVariable(const framework::Scope& scope,
const std::string& inname,
const std::string& outname) {
ClientContext context;
VariableMessage msg, out_msg;
// FIXME(typhoonzero): pass device context to here.
auto ctx = platform::CPUDeviceContext();
auto* var = scope.FindVar(inname);
PADDLE_ENFORCE(var);
// TODO(typhoonzero): support SelectedRows
PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
"Only support LoDTensor, %s has wrong type", inname);
const framework::LoDTensor& tensor = var->Get<framework::LoDTensor>();
std::ostringstream oss;
framework::SerializeToStream(oss, tensor, ctx);
msg.set_varname(inname);
msg.set_serialized(oss.str());
Status status = stub_->SendVariable(&context, msg, &out_msg);
if (!status.ok()) {
return false;
}
std::istringstream iss(out_msg.serialized());
framework::LoDTensor ret_tensor;
framework::DeserializeFromStream(iss, &ret_tensor);
auto* outvar = scope.FindVar(outname);
framework::LoDTensor* out_tensor = outvar->GetMutable<framework::LoDTensor>();
// FIXME(typhoonzero): do not copy.
framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor);
return true;
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto3";
package sendrecv;
service SendRecvService {
// For parameter server round-robin like hashing, do not split tensors.
// Send and recv only one tensor
rpc SendVariable(VariableMessage) returns (VariableMessage) {}
}
// VariableMessage is serialized paddle variable message.
// It can be:
// Tensor
// LoDTensor
// SelectedRows
message VariableMessage {
string varname = 1;
bytes serialized = 2;
}
message VoidMessage {}
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/data_type.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/operators/detail/simple_block_queue.h"
// #include <grpc++/channel.h>
// #include <grpc++/client_context.h>
// #include <grpc++/create_channel.h>
// #include <grpc++/security/credentials.h>
#include "paddle/operators/detail/send_recv.grpc.pb.h"
#include "paddle/operators/detail/send_recv.pb.h"
#include <grpc++/grpc++.h>
using grpc::Channel;
using grpc::Server;
using grpc::ServerContext;
using grpc::ServerReader;
using grpc::ServerBuilder;
using grpc::ClientContext;
using grpc::ClientReader;
using grpc::ClientReaderWriter;
using grpc::ClientWriter;
using grpc::Status;
using sendrecv::SendRecvService;
using sendrecv::VariableMessage;
using sendrecv::VoidMessage;
namespace paddle {
namespace operators {
namespace detail {
class SendRecvServerImpl final : public SendRecvService::Service {
public:
explicit SendRecvServerImpl() {}
Status SendVariable(ServerContext *context, const VariableMessage *in_var,
VariableMessage *out_var) override;
const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
void Push(const framework::LoDTensor &tensor) {
this->lodtensor_return_queue_.Push(tensor);
}
private:
SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
};
// RPCClient is a class to send tensors to pserver sub-network
// using different hashing methods.
class RPCClient {
public:
RPCClient(std::shared_ptr<Channel> channel)
: stub_(SendRecvService::NewStub(channel)) {}
bool SendVariable(const framework::Scope &scope, const std::string &inname,
const std::string &outname);
private:
std::unique_ptr<SendRecvService::Stub> stub_;
};
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <deque>
#include <mutex>
namespace paddle {
namespace operators {
namespace detail {
template <typename T>
class SimpleBlockQueue {
private:
std::mutex mutex_;
std::condition_variable condition_;
std::deque<T> queue_;
public:
void Push(T const& value) {
{
std::unique_lock<std::mutex> lock(this->mutex_);
queue_.push_front(value);
}
this->condition_.notify_one();
}
T Pop() {
std::unique_lock<std::mutex> lock(this->mutex_);
this->condition_.wait(lock, [=] { return !this->queue_.empty(); });
T rc(std::move(this->queue_.back()));
this->queue_.pop_back();
return rc;
}
};
} // namespace detail
} // namespace operators
} // namespace paddle
......@@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase {
out_var_name);
auto *tensor = out_var->GetMutable<framework::LoDTensor>();
uint32_t version;
fin.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
framework::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
fin.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
fin.read(reinterpret_cast<char *>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(),
std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void *buf;
platform::Place cpu = platform::CPUPlace();
switch (desc.data_type()) {
case framework::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case framework::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case framework::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case framework::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
fin.read(static_cast<char *>(buf), tensor->memory_size());
}
{ // read lod
uint64_t lod_level;
fin.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
fin.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
fin.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
lod[i] = tmp;
}
}
framework::DeserializeFromStream(fin, tensor);
auto place = dev_ctx.GetPlace();
if (platform::is_gpu_place(place)) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/log_loss_op.h"
namespace paddle {
namespace operators {
class LogLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Predicted"),
"Input(Predicted) must be initialized.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) must be initialized.");
auto pred_dims = ctx->GetInputDim("Predicted");
auto label_dims = ctx->GetInputDim("Labels");
PADDLE_ENFORCE_EQ(pred_dims, label_dims);
PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
"The rank of Input(Predicted) must be 2 and the shape is "
"[batch_size, 1].");
PADDLE_ENFORCE_EQ(pred_dims[1], 1,
"Each row of Input(Predicted) contains a real value, "
"so the 2nd dimension of Input(X) must be 1.");
ctx->SetOutputDim("Loss", {pred_dims[0], 1});
ctx->ShareLoD("Predicted", "Loss");
}
};
template <typename AttrType>
class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LogLossOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Predicted",
"The input value (Predicted) of Log loss op."
"Predicted is a 2-D tensor with shape [batch_size, 1].");
AddInput("Labels",
"The target value (Labels) of Log loss op."
"Labels is a 2-D tensor with shape [batch_size, 1].");
AddOutput("Loss",
"The output tensor with shape [batch_size, 1] "
"which represents the log loss.");
AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
AddComment(R"DOC(
LogLoss Operator.
Log loss is a loss function used for binary classification. Log Loss quantifies
the accuracy of a classifier by penalising false classifications. Minimising the
Log Loss is equivalent to maximising the accuracy of the classifier. We define
Predicted as the values predicted by our model and Labels as the target ground
truth value. Log loss can evaluate how close the predicted values are to the
target. The shapes of Predicted and Labels are both [batch_size, 1].
The equation is:
$$
Loss = - Labels * log(Predicted + \epsilon) -
(1 - Labels) * log(1 - Predicted + \epsilon)
$$
)DOC");
}
};
class LogLossGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Predicted"),
"Input(Predicted) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
"Input(Loss@GRAD) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
"Output(Predicted@GRAD) should not be null.");
auto pred_dims = ctx->GetInputDim("Predicted");
auto label_dims = ctx->GetInputDim("Labels");
auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
auto pred_grad_name = framework::GradVarName("Predicted");
ctx->SetOutputDim(pred_grad_name, pred_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
ops::LogLossGradOp);
REGISTER_OP_CPU_KERNEL(log_loss,
ops::LogLossKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/log_loss_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(log_loss,
ops::LogLossKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T, typename AttrType = T>
class LogLossKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* loss_out = ctx.Output<Tensor>("Loss");
loss_out->mutable_data<T>(ctx.GetPlace());
auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
auto loss = EigenVector<T>::Flatten(*loss_out);
auto place = ctx.GetEigenDevice<Place>();
loss.device(place) = (-(label * (prediction + epsilon).log()) -
((static_cast<T>(1) - label) *
(static_cast<T>(1) - prediction + epsilon).log()));
}
};
template <typename Place, typename T, typename AttrType = T>
class LogLossGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
auto dl = EigenVector<T>::Flatten(*dloss);
auto place = ctx.GetEigenDevice<Place>();
if (dpred) {
dpred->mutable_data<T>(ctx.GetPlace());
auto dx = framework::EigenVector<T>::Flatten(*dpred);
dx.device(place) = dl * (-(label / (prediction + epsilon)) +
((static_cast<T>(1) - label) /
(static_cast<T>(1) - prediction + epsilon)));
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream>
#include <thread>
#include <unistd.h>
#include "paddle/framework/data_type.h"
#include "paddle/framework/executor.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/send_recv_impl.h"
#include "paddle/operators/detail/simple_block_queue.h"
namespace paddle {
namespace operators {
void RunServer(Server **rpc_server,
std::shared_ptr<detail::SendRecvServerImpl> service,
const std::string &server_address) {
ServerBuilder builder;
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
builder.RegisterService(service.get());
std::unique_ptr<Server> server(builder.BuildAndStart());
*rpc_server = server.get();
LOG(INFO) << "Server listening on " << server_address << std::endl;
server->Wait();
}
class RecvOp : public framework::OperatorBase {
public:
RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
rpc_service_.reset(new detail::SendRecvServerImpl());
std::string endpoint = Attr<std::string>("endpoint");
server_thread_.reset(
new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint));
}
}
virtual ~RecvOp() {
rpc_server_->Shutdown();
server_thread_->join();
}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
// blocking get one var from client.
const framework::LoDTensor &t = rpc_service_->Get();
framework::Scope &recv_scope = scope.NewScope();
// set graph input var
auto *var = recv_scope.Var(Input("RX"));
auto *tensor = var->GetMutable<framework::LoDTensor>();
// FIXME(typhoonzero): do not copy
framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
auto *block = Attr<framework::BlockDescBind *>("OptimizeBlock");
auto *program = block->Program();
framework::Executor executor(dev_ctx);
// Run sub graph to get optimized tensor
executor.Run(*program, &recv_scope, block->ID(),
false /*create_local_scope*/);
auto *out_var = recv_scope.FindVar("Out");
// push back
rpc_service_->Push(out_var->Get<framework::LoDTensor>());
}
protected:
// grpc server instance to track status and gracefully shutdown.
// borrow an pointer from server thread.
Server *rpc_server_{nullptr};
// grpc send/recv service implement to register.
std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
};
class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("RX", "(Tensor) Input tensor to be saved");
AddComment(R"DOC(
Recv operator
This operator will recv tensor from send_op
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDescBind *>("OptimizeBlock", "type BlockDescBind*",
"optimize network run in server");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
......@@ -133,53 +133,47 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::Tensor>("ROIs");
auto* argmax = ctx.Input<framework::Tensor>("Argmax");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
if (x_grad) {
int channels = in->dims()[1];
auto in_stride = framework::stride(in->dims());
auto roi_stride = framework::stride(rois->dims());
if (in_grad) {
const int64_t* rois_data = rois->data<int64_t>();
int rois_num = rois->dims()[0];
T* x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const int64_t* argmax_data = argmax->data<int64_t>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
math::SetConstant<Place, T> set_zero;
set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
set_zero(ctx.device_context(), in_grad, static_cast<T>(0));
size_t roi_offset = roi_stride[0];
size_t batch_offset = in_stride[0];
size_t channel_offset = in_stride[1];
auto in_stride = framework::stride(in->dims());
auto argmax_stride = framework::stride(argmax->dims());
auto roi_stride = framework::stride(rois->dims());
auto out_stride = framework::stride(out_grad->dims());
const T* out_grad_data = out_grad->data<T>();
size_t pool_channel_offset = pooled_height * pooled_width;
const int64_t* argmax_data = argmax->data<int64_t>();
int rois_num = rois->dims()[0];
int channels = in->dims()[1];
for (size_t n = 0; n < rois_num; ++n) {
size_t roi_batch_idx = rois_data[0];
T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx;
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = rois_data[0];
T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
size_t pool_index = ph * pooled_width + pw;
int pool_index = ph * pooled_width + pw;
if (argmax_data[pool_index] >= 0) {
size_t index = static_cast<size_t>(argmax_data[pool_index]);
auto index = argmax_data[pool_index];
batch_grad_data[index] += out_grad_data[pool_index];
}
}
}
batch_grad_data += channel_offset;
out_grad_data += pool_channel_offset;
argmax_data += pool_channel_offset;
batch_grad_data += in_stride[1];
out_grad_data += out_stride[1];
argmax_data += argmax_stride[1];
}
rois_data += roi_offset;
rois_data += roi_stride[0];
}
}
}
......
......@@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase {
"SaveOp only support LoDTensor, %s has wrong type", iname);
auto &tensor = var->Get<framework::LoDTensor>();
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
fout.write(reinterpret_cast<const char *>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
framework::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto *pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
fout.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto *data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto &gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::GPUPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
fout.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
fout.write(static_cast<const char *>(data_ptr),
static_cast<std::streamsize>(size));
}
}
{ // the 4th field, lod information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
auto lod = tensor.lod();
uint64_t size = lod.size();
fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
for (auto &each : lod) {
size = each.size() * sizeof(framework::LoD::value_type::value_type);
fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
fout.write(reinterpret_cast<const char *>(each.data()),
static_cast<std::streamsize>(size));
}
}
framework::SerializeToStream(fout, tensor, dev_ctx);
}
};
......
......@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
ops::ScaleGradMaker);
REGISTER_OP_CPU_KERNEL(scale,
ops::ScaleKernel<paddle::platform::CPUPlace, float>,
ops::ScaleKernel<paddle::platform::CPUPlace, double>);
ops::ScaleKernel<paddle::platform::CPUPlace, double>,
ops::ScaleKernel<paddle::platform::CPUPlace, int>,
ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
......@@ -16,4 +16,6 @@
REGISTER_OP_GPU_KERNEL(
scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>);
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/send_recv_impl.h"
#include "paddle/operators/detail/simple_block_queue.h"
namespace paddle {
namespace operators {
// TODO(typhoonzero): this is a simple implementation which only send
// one tensor
class SendOp : public framework::OperatorBase {
public:
SendOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
// init client when the operator is created at runtime.
if (!client_) {
std::string endpoint = Attr<std::string>("endpoint");
client_.reset(new detail::RPCClient(
grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials())));
// TODO(typhoonzero): how to call InitVariables
}
}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto iname = Input("X");
auto oname = Output("Out");
// TODO(typhoonzero): currently it's non-blocking,
// should block until server responds.
bool ret = client_->SendVariable(scope, iname, oname);
if (!ret) {
LOG(ERROR) << "send variable error";
}
}
protected:
std::shared_ptr<detail::RPCClient> client_{nullptr};
};
class SendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) Input tensor to be saved");
AddOutput("Out", "(Tensor) Output fetched from server");
AddComment(R"DOC(
Recv operator
This operator will recv tensor from send_op
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// TODO(typhoonzero): add python bindings for this test as
// a RemoteOptimizer.
#include <unistd.h>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h"
USE_NO_KERNEL_OP(send);
USE_NO_KERNEL_OP(recv);
USE_OP(sum);
// global for simplicity.
std::unique_ptr<paddle::framework::OperatorBase> recv_op;
void InitTensorsInScope(paddle::framework::Scope &scope,
paddle::platform::CPUPlace &place) {
paddle::platform::CPUDeviceContext ctx(place);
auto var = scope.Var("X");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({10, 10});
float *expect = tensor->mutable_data<float>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<float>(i);
}
auto out_var = scope.Var("Out");
auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
out_tensor->Resize({10, 10});
tensor->mutable_data<float>(place); // allocate
}
void AddOp(const std::string &type,
const paddle::framework::VariableNameMap &inputs,
const paddle::framework::VariableNameMap &outputs,
paddle::framework::AttributeMap attrs,
paddle::framework::BlockDescBind *block) {
// insert output
for (auto kv : outputs) {
for (auto v : kv.second) {
auto var = block->Var(v);
var->SetDataType(paddle::framework::DataType::FP32);
}
}
// insert op
auto op = block->AppendOp();
op->SetType(type);
for (auto &kv : inputs) {
op->SetInput(kv.first, kv.second);
}
for (auto &kv : outputs) {
op->SetOutput(kv.first, kv.second);
}
op->SetAttrMap(attrs);
}
void StartServerNet() {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
InitTensorsInScope(scope, place);
// sub program run in recv_op, for simple test we use sum
paddle::framework::ProgramDescBind program;
paddle::framework::BlockDescBind *block = program.MutableBlock(0);
// X for server side tensors, RX for received tensers, must be of same shape.
AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block);
paddle::framework::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
attrs.insert({"OptimizeBlock", block});
recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
{{"Out", {"Out"}}}, attrs);
paddle::platform::CPUDeviceContext ctx(place);
recv_op->Run(scope, ctx);
}
TEST(SendRecvOp, CPU) {
std::thread server_thread(StartServerNet);
sleep(5); // wait server to start
// local net
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
InitTensorsInScope(scope, place);
paddle::framework::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
auto send_op = paddle::framework::OpRegistry::CreateOp(
"send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
paddle::platform::CPUDeviceContext ctx(place);
send_op->Run(scope, ctx);
auto in_var = scope.Var("X");
auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
float *expected = tensor->data<float>();
auto out_var = scope.Var("Out");
auto target = out_var->GetMutable<paddle::framework::LoDTensor>();
// send fail cause output is none.
EXPECT_NE(target->memory_size(), size_t(0));
float *actual = target->data<float>();
for (int64_t i = 0; i < target->numel(); ++i) {
EXPECT_EQ(expected[i] * 2, actual[i]);
}
recv_op.reset(); // dtor can shutdown and join server thread.
server_thread.join();
}
......@@ -55,7 +55,7 @@ SGD operator
This operator implements one step of the stochastic gradient descent algorithm.
$$param_out = param - learning_rate * grad$$
$$param\_out = param - learning\_rate * grad$$
)DOC");
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace paddle {
namespace platform {
void CudaProfilerInit(std::string output_file, std::string output_mode,
std::vector<std::string> config_flags) {
std::array<char, 128> buf;
std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
memcpy(buf.data(), tmpl.data(), tmpl.size());
auto result = mktemp(buf.data());
PADDLE_ENFORCE(strlen(result) != 0);
std::string config_file = result;
{
std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
for (const auto& line : config_flags) {
ofs << line << std::endl;
}
}
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
PADDLE_ENFORCE(
cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
}
void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
} // namespace platform
} // namespace paddle
......@@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -37,6 +37,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/operators/nccl/nccl_gpu_common.h"
#include "paddle/platform/cuda_profiler.h"
#include "paddle/platform/gpu_info.h"
#endif
......@@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU);
#ifdef PADDLE_WITH_CUDA
m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
m.def("nvprof_init", platform::CudaProfilerInit);
m.def("nvprof_start", platform::CudaProfilerStart);
m.def("nvprof_stop", platform::CudaProfilerStop);
#endif
return m.ptr();
......
......@@ -16,11 +16,13 @@ function cmake_gen() {
echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
elif [ "$1" == "cp27-cp27mu" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
......
......@@ -11,8 +11,9 @@ make -j `nproc` gen_proto_py
make -j `nproc` paddle_docs paddle_docs_cn
# check websites for broken links
linkchecker doc/en/html/index.html
linkchecker doc/cn/html/index.html
# It will be failed now!
#linkchecker doc/en/html/index.html
#linkchecker doc/cn/html/index.html
# Parse Github URL
REPO=`git config remote.origin.url`
......
......@@ -544,6 +544,9 @@ message LayerConfig {
// for batch normalization layer
// The small constant added to the variance to improve numeric stability.
optional double epsilon = 60 [ default = 0.00001 ];
// for factorization machine layer
optional uint32 factor_size = 61;
}
message EvaluatorConfig {
......
......@@ -2400,6 +2400,14 @@ class CropLayer(LayerBase):
image_conf.img_size_y = input_layer.height
image_conf.channels = input_layer.size / (input_layer.width *
input_layer.height)
# only support for 4-dims inputs and NCHW order
if (len(self.config.inputs) == 2):
self.set_layer_height_width(
self.get_input_layer(1).height, self.get_input_layer(1).width)
self.set_layer_size(self.get_input_layer(1).size)
else:
self.set_layer_height_width(shape[-2], shape[-1])
self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
@config_layer('batch_norm')
......@@ -3849,6 +3857,26 @@ class SwitchOrderLayer(LayerBase):
name, 'switch_order', 0, inputs=inputs, **xargs)
self.config.reshape_conf.height_axis.extend(reshape['height'])
self.config.reshape_conf.width_axis.extend(reshape['width'])
input_layer = self.get_input_layer(0)
if reshape is None:
self.set_layer_size(input_layer.size)
else:
in_h = input_layer.height
in_w = input_layer.width
out_dims = None
if input_layer.has_depth():
in_d = input_layer.depth
in_c = input_layer.size / in_h / in_w / in_d
# batch_size, depth, height, width, channel
out_dims = [0, in_d, in_h, in_w, in_c]
else:
in_c = input_layer.size / in_h / in_w
# batch_size, height, width, channel
out_dims = [0, in_h, in_w, in_c]
# Because (reshape['width'][0] > 0) always be true.
# So out_dims[0] won't be used.
size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
self.set_layer_size(size)
@config_layer('scale_sub_region')
......@@ -3870,6 +3898,21 @@ class ScaleSubRegionLayer(LayerBase):
image_conf.channels)
@config_layer('factorization_machine')
class FactorizationMachineLayer(LayerBase):
def __init__(self, name, inputs, factor_size, **xargs):
super(FactorizationMachineLayer, self).__init__(
name, 'factorization_machine', size=1, inputs=inputs, **xargs)
config_assert(
len(self.inputs) == 1,
'factorization machine layer must have one and only one input.')
self.config.factor_size = factor_size
input_layer = self.get_input_layer(0)
psize = input_layer.size * factor_size
dims = [input_layer.size, factor_size]
self.create_input_parameter(0, psize, dims)
# Deprecated, use a new layer specific class instead
@config_func
def Layer(name, type, **xargs):
......
......@@ -148,6 +148,7 @@ __all__ = [
'resize_layer',
'sub_seq_layer',
'scale_sub_region_layer',
'factorization_machine',
]
......@@ -264,6 +265,8 @@ class LayerType(object):
SCALE_SUB_REGION_LAYER = 'scale_sub_region'
FACTORIZATION_MACHINE = 'factorization_machine'
@staticmethod
def is_layer_type(type_name):
"""
......@@ -2985,8 +2988,8 @@ def spp_layer(input,
A layer performs spatial pyramid pooling.
Reference:
Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
https://arxiv.org/abs/1406.4729
`Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
https://arxiv.org/abs/1406.4729`_
The example usage is:
......@@ -3087,8 +3090,8 @@ def img_cmrnorm_layer(input,
Response normalization across feature maps.
Reference:
ImageNet Classification with Deep Convolutional Neural Networks
http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf
`ImageNet Classification with Deep Convolutional Neural Networks
http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_
The example usage is:
......@@ -3154,9 +3157,9 @@ def batch_norm_layer(input,
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
Reference:
Batch Normalization: Accelerating Deep Network Training by Reducing
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift
http://arxiv.org/abs/1502.03167
http://arxiv.org/abs/1502.03167`_
The example usage is:
......@@ -5413,10 +5416,10 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
to be devided by groups.
Reference:
Maxout Networks
http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
https://arxiv.org/pdf/1312.6082v4.pdf
`Maxout Networks
http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_
`Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
https://arxiv.org/pdf/1312.6082v4.pdf`_
.. math::
y_{si+j} = \max_k x_{gsi + sk + j}
......@@ -5481,9 +5484,9 @@ def ctc_layer(input,
alignment between the inputs and the target labels is unknown.
Reference:
Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
`Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
with Recurrent Neural Networks
http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
Note:
Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
......@@ -5555,9 +5558,9 @@ def warp_ctc_layer(input,
install it to :code:`third_party/install/warpctc` directory.
Reference:
Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
`Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
with Recurrent Neural Networks
http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
Note:
- Let num_classes represents the category number. Considering the 'blank'
......@@ -5777,8 +5780,8 @@ def nce_layer(input,
Noise-contrastive estimation.
Reference:
A fast and simple algorithm for training neural probabilistic language
models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf
`A fast and simple algorithm for training neural probabilistic language
models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_
The example usage is:
......@@ -5893,8 +5896,8 @@ def rank_cost(left,
A cost Layer for learning to rank using gradient descent.
Reference:
Learning to Rank using Gradient Descent
http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf
`Learning to Rank using Gradient Descent
http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_
.. math::
......@@ -6429,8 +6432,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if} \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
Reference:
Fast R-CNN
https://arxiv.org/pdf/1504.08083v2.pdf
`Fast R-CNN
https://arxiv.org/pdf/1504.08083v2.pdf`_
The example usage is:
......@@ -6636,8 +6639,8 @@ def prelu_layer(input,
The Parametric Relu activation that actives outputs with a learnable weight.
Reference:
Delving Deep into Rectifiers: Surpassing Human-Level Performance on
ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
`Delving Deep into Rectifiers: Surpassing Human-Level Performance on
ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_
.. math::
z_i &\\quad if \\quad z_i > 0 \\\\
......@@ -6733,8 +6736,8 @@ def gated_unit_layer(input,
product between :match:`X'` and :math:`\sigma` is finally returned.
Reference:
Language Modeling with Gated Convolutional Networks
https://arxiv.org/abs/1612.08083
`Language Modeling with Gated Convolutional Networks
https://arxiv.org/abs/1612.08083`_
.. math::
y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
......@@ -6870,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
:param input: The input of this layer. If two inputs are given, the second one
will be regarded as the reference.
And the input must be 4-dims and in NCHW order.
:type input: LayerOutput | Sequence
:param offset: The crop offset.
:type offset: Sequence
......@@ -7403,3 +7407,73 @@ def scale_sub_region_layer(input, indices, value, name=None):
parents=[input, indices],
num_filters=input.num_filters,
size=input.size)
@wrap_name_default()
@wrap_act_default(act=LinearActivation())
@wrap_param_attr_default()
@layer_support()
def factorization_machine(input,
factor_size,
act=None,
name=None,
param_attr=None,
layer_attr=None):
"""
The Factorization Machine models pairwise feature interactions as inner
product of the learned latent vectors corresponding to each input feature.
The Factorization Machine can effectively capture feature interactions
especially when the input is sparse.
This implementation only consider the 2-order feature interactions using
Factorization Machine with the formula:
.. math::
y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
Note:
X is the input vector with size n. V is the factor matrix. Each row of V
is the latent vector corresponding to each input dimesion. The size of
each latent vector is k.
For details of Factorization Machine, please refer to the paper:
Factorization machines.
.. code-block:: python
first_order = paddle.layer.fc(input=input,
size=1,
act=paddle.activation.Linear())
second_order = paddle.layer.factorization_machine(input=input,
factor_size=10)
fm = paddle.layer.addto(input=[first_order, second_order],
act=paddle.activation.Linear(),
bias_attr=False)
:param input: The input layer. Supported input types: all input data types
on CPU, and only dense input types on GPU.
:type input: LayerOutput
:param factor_size: The hyperparameter that defines the dimensionality of
the latent vector size.
:type context_len: int
:param act: Activation Type. Default is linear activation.
:type act: BaseActivation
:param param_attr: The parameter attribute. See ParameterAttribute for
details.
:type param_attr: ParameterAttribute
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(input, LayerOutput)
assert factor_size > 0, "the factor_size must be greater than 0."
Layer(
inputs=[Input(input.name, **param_attr.attr)],
name=name,
factor_size=factor_size,
type=LayerType.FACTORIZATION_MACHINE,
active_type=act.name,
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
......@@ -11,6 +11,7 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l
test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer)
test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer
test_factorization_machine)
export whole_configs=(test_split_datasource)
type: "nn"
layers {
name: "data"
type: "data"
size: 1024
active_type: ""
}
layers {
name: "__factorization_machine_0__"
type: "factorization_machine"
size: 1
active_type: ""
inputs {
input_layer_name: "data"
input_parameter_name: "___factorization_machine_0__.w0"
}
factor_size: 10
}
parameters {
name: "___factorization_machine_0__.w0"
size: 10240
initial_mean: 0.0
initial_std: 0.03125
dims: 1024
dims: 10
initial_strategy: 0
initial_smart: true
}
input_layer_names: "data"
output_layer_names: "__factorization_machine_0__"
sub_models {
name: "root"
layer_names: "data"
layer_names: "__factorization_machine_0__"
input_layer_names: "data"
output_layer_names: "__factorization_machine_0__"
is_recurrent_layer_group: false
}
from paddle.trainer_config_helpers import *
data = data_layer(name='data', size=1024)
fm = factorization_machine(input=data, factor_size=10)
outputs(fm)
......@@ -83,11 +83,10 @@ def set_omp_mkl_env_vars(trainer_count):
'''Get the number of physical cores'''
if platform.system() == "Linux":
num_sockets = int(
os.popen("lscpu |grep \"Socket\" |awk -F':' '{print $2}'|xargs")
os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
.read())
num_cores_per_socket = int(
os.popen(
"lscpu |grep \"per socket\" |awk -F':' '{print $2}'|xargs")
os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
.read())
return num_sockets * num_cores_per_socket
else:
......
......@@ -26,9 +26,9 @@ class Evaluator(object):
name(str): The name of evaluator. such as, "accuracy". Used for generate
temporary variable name.
main_program(Program, optional): The evaluator should be added to this
main_program. Default g_main_program
main_program. Default default_main_program()
startup_program(Program, optional):The parameter should be added to this
startup_program. Default g_startup_program
startup_program. Default default_startup_program()
Attributes:
states(list): The list of state variables. states will be reset to zero
......
import numpy as np
from . import core
from framework import Program, g_main_program
from framework import Program, default_main_program
__all__ = ['Executor', 'g_scope']
......@@ -103,7 +103,7 @@ class Executor(object):
fetch_list = []
if program is None:
program = g_main_program
program = default_main_program()
if not isinstance(program, Program):
raise TypeError()
......
......@@ -6,7 +6,7 @@ import proto.framework_pb2 as framework_pb2
__all__ = [
'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
'default_main_program', 'g_startup_program', 'g_main_program'
'default_main_program'
]
......@@ -654,13 +654,13 @@ class Parameter(Variable):
# program is a global instance.
g_main_program = Program()
g_startup_program = Program()
_main_program_ = Program()
_startup_program_ = Program()
def default_startup_program():
return g_startup_program
return _startup_program_
def default_main_program():
return g_main_program
return _main_program_
import os
import cPickle as pickle
from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
Variable
from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
__all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
......@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
"""
if vars is None:
if main_program is None:
main_program = g_main_program
main_program = default_main_program()
if not isinstance(main_program, Program):
raise TypeError("program should be as Program type or None")
......@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
:param executor: executor that save variable
:param dirname: directory path
:param main_program: program. If vars is None, then filter all variables in this
program which fit `predicate`. Default g_program.
program which fit `predicate`. Default default_main_program().
:param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the variables will be loaded.
:param vars: variables need to be loaded. If specify vars, program &
......@@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
"""
if vars is None:
if main_program is None:
main_program = g_main_program
main_program = default_main_program()
if not isinstance(main_program, Program):
raise TypeError("program's type should be Program")
......@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None):
def get_inference_program(target_vars, main_program=None):
if main_program is None:
main_program = g_main_program
main_program = default_main_program()
if not isinstance(target_vars, list):
target_vars = [target_vars]
......@@ -177,12 +176,12 @@ def save_inference_model(dirname,
:param target_vars: Variables from which we can get inference results.
:param executor: executor that save inference model
:param main_program: original program, which will be pruned to build the inference model.
Default g_main_program.
Default default_main_program().
:return: None
"""
if main_program is None:
main_program = g_main_program
main_program = default_main_program()
if not isinstance(target_vars, list):
target_vars = [target_vars]
......@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None):
:param executor: executor for retrieving the value
:param name: the name of the parameter
:param program: the program where the variable is found
Default g_main_program.
Default default_main_program().
:return: the LoDTensor for the variable
"""
if program is None:
program = g_main_program
program = default_main_program()
var = program.global_block().var(name)
return get_parameter_value(var, executor)
import copy
import itertools
from framework import Variable, g_main_program, \
g_startup_program, unique_name, dtype_is_floating
from framework import Variable, default_main_program, default_startup_program, unique_name, dtype_is_floating
from paddle.v2.fluid.initializer import Constant, Xavier
......@@ -22,7 +21,7 @@ class LayerHelper(object):
def main_program(self):
prog = self.kwargs.get('main_program', None)
if prog is None:
return g_main_program
return default_main_program()
else:
return prog
......@@ -30,7 +29,7 @@ class LayerHelper(object):
def startup_program(self):
prog = self.kwargs.get('startup_program', None)
if prog is None:
return g_startup_program
return default_startup_program()
else:
return prog
......
from . import core
import core
import proto.framework_pb2 as framework_pb2
from framework import OpProtoHolder, Variable, Program, Operator
from initializer import Constant, Normal, Xavier
from initializer import Constant, Normal, Xavier, Initializer
from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
import re
import cStringIO
......@@ -1587,6 +1587,97 @@ def array_length(array, main_program=None):
return tmp
def conv2d_transpose(input,
num_filters,
output_size=None,
filter_size=None,
padding=None,
stride=None,
param_attr=None,
param_initializer=None,
main_program=None,
startup_program=None):
"""
The transpose of conv2d layer.
This layer is also known as deconvolution layer.
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of filter. It is as same as the output
image channel.
output_size(int|tuple|None): The output image size. If output size is a
tuple, it must contain two integers, (image_H, image_W). This
parameter only works when filter_size is None.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square. None if use output size to
calculate filter_size
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride.
param_attr: Parameter Attribute.
param_initializer(Initializer): Parameter Initializer. Default is Xavier
main_program(Program): the main program
startup_program(Program): the startup program
Returns:
Variable: Output image.
"""
helper = LayerHelper("conv2d_transpose", **locals())
if not isinstance(input, Variable):
raise TypeError("Input of conv2d_transpose must be Variable")
input_channel = input.shape[1]
op_attr = dict()
if isinstance(padding, int):
op_attr['paddings'] = [padding, padding]
elif padding is not None:
op_attr['paddings'] = padding
if isinstance(stride, int):
op_attr['strides'] = stride
elif stride is not None:
op_attr['strides'] = stride
if filter_size is None:
if output_size is None:
raise ValueError("output_size must be set when filter_size is None")
if isinstance(output_size, int):
output_size = [output_size, output_size]
padding = op_attr.get('paddings', [0, 0])
stride = op_attr.get('strides', [1, 1])
h_in = input.shape[2]
w_in = input.shape[3]
filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
filter_size = [filter_size_h, filter_size_w]
elif isinstance(filter_size, int):
filter_size = [filter_size, filter_size]
filter_shape = [input_channel, num_filters] + filter_size
img_filter = helper.create_parameter(
dtype=input.dtype,
shape=filter_shape,
attr=helper.param_attr,
initializer=param_initializer)
out = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op(
type='conv2d_transpose',
inputs={'Input': [input],
'Filter': [img_filter]},
outputs={'Output': out},
attrs=op_attr)
return out
class ConditionalBlockGuard(BlockGuard):
def __init__(self, block):
if not isinstance(block, ConditionalBlock):
......
import paddle.v2.fluid.core as core
from contextlib import contextmanager
__all__ = ['CudaProfiler']
NVPROF_CONFIG = [
"gpustarttimestamp",
"gpuendtimestamp",
"gridsize3d",
"threadblocksize",
"streamid",
"enableonstart 0",
"conckerneltrace",
]
@contextmanager
def cuda_profiler(output_file, output_mode=None, config=None):
"""The CUDA profiler.
This fuctions is used to profile CUDA program by CUDA runtime application
programming interface. The profiling result will be written into
`output_file` with Key-Value pair format or Comma separated values format.
The user can set the output mode by `output_mode` argument and set the
counters/options for profiling by `config` argument. The default config
caontains 'gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'.
Args:
output_file (string) : The output file name, the result will be
written into this file.
output_mode (string) : The output mode has Key-Value pair format and
Comma separated values format. It should be 'kv' or 'csv'.
config (string) : The profiler options and counters can refer to
"Compute Command Line Profiler User Guide".
"""
if output_mode is None:
output_mode = 'csv'
if output_mode not in ['kv', 'csv']:
raise ValueError("The output mode must be 'key-value' or 'csv'.")
config = NVPROF_CONFIG if config is None else config
core.nvprof_init(output_file, output_mode, config)
# Enables profiler collection by the active CUDA profiling tool.
core.nvprof_start()
yield
# Disables profiler collection.
core.nvprof_stop()
......@@ -69,8 +69,7 @@ def vgg16_bn_drop(input):
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
reshape1 = fluid.layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1)))
bn = fluid.layers.batch_norm(input=reshape1, act='relu')
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2
......
......@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops
from paddle.v2.fluid.framework import g_main_program
from paddle.v2.fluid.framework import default_main_program
import numpy
......@@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase):
append_backward_ops(total_sum_scaled)
g_vars = map(g_main_program.global_block().var,
g_vars = map(default_main_program().global_block().var,
[each_x.name + "@GRAD" for each_x in x])
g_out = [
item.sum()
......
......@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set):
def _reference_training(x, scale, offset, epsilon, data_format):
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
if data_format == "NCHW":
n, c, h, w = x.shape
x_square = x * x
......@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format):
offset_tile = np.reshape(offset, (1, c, 1, 1))
offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
y = normalized * scale_tile + offset_tile
if len(x_shape) == 2:
y = np.reshape(y, (y.shape[0], y.shape[1]))
return y, mean, var
elif data_format == "NHWC":
x_square = x * x
......@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format):
mean = x_sum / element_count
var = x_square_sum / element_count - mean * mean
normalized = (x - mean) / np.sqrt(var + epsilon)
return (normalized * scale + offset), mean, var
y = normalized * scale + offset
if len(x_shape) == 2:
y = np.reshape(y, x_shape)
return y, mean, var
else:
raise ValueError("Unknown data order.")
......@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
# (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
# transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
grad_y = np.reshape(grad_y,
(grad_y.shape[0], grad_y.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
grad_y = np.reshape(grad_y,
(grad_y.shape[0], 1, 1, grad_y.shape[1]))
if data_format == "NCHW":
x = np.transpose(x, (0, 2, 3, 1))
grad_y = np.transpose(grad_y, (0, 2, 3, 1))
......@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
grad_x = np.transpose(grad_x, (0, 3, 1, 2))
x = np.transpose(x, (0, 3, 1, 2))
grad_y = np.transpose(grad_y, (0, 3, 1, 2))
if len(x_shape) == 2:
grad_x = np.reshape(grad_x, x_shape)
return grad_x, grad_scale, grad_offset
......@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest):
momentum = 0.9
# N, H, W, C: 2, 3, 4, 2
n, h, w, c = 2, 3, 4, 2
n, h, w, c = 2, 3, 4, 5
x_shape = [n, h, w, c]
scale_shape = [c]
......@@ -184,20 +211,23 @@ class TestBatchNormOp(OpTest):
print 'python: NHWC, NCHW, backward checking passed'
def test_forward_backward(self):
def test_with_place(place, tensor_format):
def test_with_place(place, tensor_format, shape):
# attr
epsilon = 0.00001
momentum = 0.9
# N, H, W, C: 12, 3, 4, 2
n, h, w, c = 2, 3, 4, 2
if data_format == "NHWC":
x_shape = [n, h, w, c]
elif data_format == "NCHW":
x_shape = [n, c, h, w]
if len(shape) == 2:
x_shape = shape
c = shape[1]
else:
raise ValueError("Unknown data type.")
# n, h, w, c = 2, 3, 4, 2
n, h, w, c = shape[0], shape[1], shape[2], shape[3]
if data_format == "NHWC":
x_shape = [n, h, w, c]
elif data_format == "NCHW":
x_shape = [n, c, h, w]
else:
raise ValueError("Unknown data type.")
scale_shape = [c]
x_val = np.random.random_sample(x_shape).astype(np.float32)
......@@ -219,7 +249,10 @@ class TestBatchNormOp(OpTest):
# for gradient test
# y_grad = np.ones(x_shape).astype(np.float32)
y_grad = np.zeros(x_shape).astype(np.float32)
y_grad[0, 0, 0, 0] = 1.
if len(y_grad.shape) == 2:
y_grad[0, 0] = 1.
else:
y_grad[0, 0, 0, 0] = 1.
# y_grad = np.random.random_sample(x_shape).astype(np.float32)
x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
......@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest):
places.append(core.GPUPlace(0))
for place in places:
for data_format in ["NCHW", "NHWC"]:
test_with_place(place, data_format)
test_with_place(place, data_format, [2, 3, 4, 5])
test_with_place(place, data_format, [2, 3])
if __name__ == '__main__':
......
import unittest
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import g_startup_program, g_main_program
from paddle.v2.fluid.framework import default_startup_program, default_main_program
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops
import numpy
......@@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase):
cpu = core.CPUPlace()
exe = Executor(cpu)
exe.run(g_startup_program)
exe.run(default_startup_program())
x = numpy.random.random(size=(10, 1)).astype('float32')
......@@ -29,7 +29,9 @@ class ConditionalBlock(unittest.TestCase):
append_backward_ops(loss=loss)
outs = exe.run(
feed={'X': x},
fetch_list=[g_main_program.block(0).var(data.name + "@GRAD")])[0]
fetch_list=[
default_main_program().block(0).var(data.name + "@GRAD")
])[0]
print outs
......
import unittest
from paddle.v2.fluid.layers import mul, data, sequence_pool
import numpy
import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import g_main_program
import numpy
from paddle.v2.fluid.layers import mul, data
class TestExecutor(unittest.TestCase):
......@@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase):
a_np = numpy.random.random((100, 784)).astype('float32')
b_np = numpy.random.random((784, 100)).astype('float32')
exe = Executor(place)
outs = exe.run(g_main_program,
feed={'a': a_np,
'b': b_np},
fetch_list=[out])
outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
out = outs[0]
self.assertEqual((100, 100), out.shape)
self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
......
import unittest
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid as fluid
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.framework import Program
......@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase):
def test_batch_norm_layer(self):
main_program = Program()
startup_program = Program()
images = layers.data(
images = fluid.layers.data(
name='pixel',
shape=[3, 48, 48],
dtype='float32',
main_program=main_program)
layers.batch_norm(
hidden1 = fluid.layers.batch_norm(
input=images,
main_program=main_program,
startup_program=startup_program)
hidden2 = fluid.layers.fc(input=hidden1,
size=128,
act='relu',
main_program=main_program)
hidden3 = fluid.layers.batch_norm(
input=hidden2,
main_program=main_program,
startup_program=startup_program)
# print str(main_program)
print str(main_program)
def test_dropout_layer(self):
main_program = Program()
startup_program = Program()
images = layers.data(
images = fluid.layers.data(
name='pixel',
shape=[3, 48, 48],
dtype='float32',
main_program=main_program)
layers.dropout(
fluid.layers.dropout(
x=images,
dropout_prob=0.5,
main_program=main_program,
......@@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase):
main_program = Program()
startup_program = Program()
images = layers.data(
images = fluid.layers.data(
name='pixel',
shape=[3, 48, 48],
dtype='float32',
......@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase):
def test_elementwise_add_with_act(self):
main_program = Program()
startup_program = Program()
image1 = layers.data(
image1 = fluid.layers.data(
name='pixel1',
shape=[3, 48, 48],
dtype='float32',
main_program=main_program,
startup_program=startup_program)
image2 = layers.data(
image2 = fluid.layers.data(
name='pixel2',
shape=[3, 48, 48],
dtype='float32',
main_program=main_program,
startup_program=startup_program)
out = layers.elementwise_add(
out = fluid.layers.elementwise_add(
x=image1,
y=image2,
act='relu',
......
......@@ -65,6 +65,15 @@ class TestBook(unittest.TestCase):
print str(program)
def test_conv2d_transpose(self):
program = Program()
kwargs = {'main_program': program}
img = layers.data(
name='pixel', shape=[3, 2, 2], dtype='float32', **kwargs)
layers.conv2d_transpose(
input=img, num_filters=10, output_size=28, **kwargs)
print str(program)
def test_recognize_digits_conv(self):
program = Program()
......
from paddle.v2.fluid.layers import lod_rank_table, data
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import g_main_program
import paddle.v2.fluid.core as core
import numpy
import unittest
......@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase):
tensor = core.LoDTensor()
tensor.set(numpy.random.random(size=(17, 100)), cpu)
tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
exe.run(g_main_program, scope=scope, feed={'x': tensor})
exe.run(scope=scope, feed={'x': tensor})
var = scope.find_var(rank_table.name)
table = var.get_lod_rank_table()
self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
......
import unittest
import numpy as np
from op_test import OpTest
class TestLogLossOp(OpTest):
def setUp(self):
self.op_type = 'log_loss'
samples_num = 32
predicted = np.random.uniform(0.1, 1.0,
(samples_num, 1)).astype("float32")
labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32")
epsilon = 1e-4
self.inputs = {
'Predicted': predicted,
'Labels': labels,
}
self.attrs = {'epsilon': epsilon}
loss = -labels * np.log(predicted + epsilon) - (
1 - labels) * np.log(1 - predicted + epsilon)
self.outputs = {'Loss': loss}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
if __name__ == '__main__':
unittest.main()
import unittest
from paddle.v2.fluid.framework import Variable, Program, g_main_program
import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import Program, default_startup_program
main_program = default_startup_program()
class TestOperator(unittest.TestCase):
def test_error_type(self):
block = g_main_program.create_block()
block = main_program.create_block()
try:
block.append_op()
self.assertFail()
......
import unittest
from paddle.v2.fluid.framework import g_main_program
from paddle.v2.fluid.framework import default_main_program
import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.io as io
from paddle.v2.fluid.initializer import ConstantInitializer
import numpy as np
main_program = default_main_program()
class TestParameter(unittest.TestCase):
def test_param(self):
shape = [784, 100]
val = 1.0625
b = g_main_program.global_block()
b = main_program.global_block()
param = b.create_parameter(
name='fc.w',
shape=shape,
......@@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase):
self.assertEqual(core.DataType.FP32, param.dtype)
self.assertEqual(0, param.block.idx)
exe = Executor(core.CPUPlace())
p = exe.run(g_main_program, fetch_list=[param])[0]
p = exe.run(main_program, fetch_list=[param])[0]
self.assertTrue(np.allclose(p, np.ones(shape) * val))
p = io.get_parameter_value_by_name('fc.w', exe, g_main_program)
p = io.get_parameter_value_by_name('fc.w', exe, main_program)
self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
......
import unittest
import numpy as np
import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler
import paddle.v2.fluid.layers as layers
class TestProfiler(unittest.TestCase):
def test_nvprof(self):
if not fluid.core.is_compile_gpu():
return
epoc = 8
dshape = [4, 3, 28, 28]
data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.GPUPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
for i in range(epoc):
input = np.random.random(dshape).astype("float32")
exe.run(fluid.default_main_program(), feed={'data': input})
if __name__ == '__main__':
unittest.main()
from __future__ import print_function
import unittest
from paddle.v2.fluid.framework import Program
from paddle.v2.fluid.framework import g_main_program
from paddle.v2.fluid.framework import Program, default_main_program
import paddle.v2.fluid.layers as layers
main_program = default_main_program()
class TestProgram(unittest.TestCase):
def test_program(self):
b = g_main_program.current_block()
b = main_program.current_block()
self.assertEqual(-1, b.parent_idx)
self.assertEqual(0, b.idx)
b = g_main_program.create_block()
b = main_program.create_block()
self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx)
b = g_main_program.create_block()
b = main_program.create_block()
self.assertEqual(2, b.idx)
self.assertEqual(1, b.parent_idx)
g_main_program.rollback()
main_program.rollback()
b = g_main_program.current_block()
b = main_program.current_block()
self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx)
b = g_main_program.create_block()
b = main_program.create_block()
self.assertEqual(3, b.idx)
self.assertEqual(1, b.parent_idx)
g_main_program.rollback()
b = g_main_program.current_block()
main_program.rollback()
b = main_program.current_block()
self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx)
......
......@@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import append_backward_ops
from paddle.v2.fluid.framework import g_main_program
from paddle.v2.fluid.framework import default_main_program
import numpy
main_program = default_main_program()
class TestShrinkRNNMemory(unittest.TestCase):
def test_shrink_rnn_memory(self):
......@@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
append_backward_ops(loss=mem3_mean)
x_grad = exe.run(
feed={'x': tensor},
fetch_list=[g_main_program.global_block().var('x@GRAD')])[0]
fetch_list=[main_program.global_block().var('x@GRAD')])[0]
self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)
......
import unittest
from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_
from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
import paddle.v2.fluid.core as core
import numpy as np
......@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
self.assertRaises(ValueError, lambda: convert("int8"))
def test_var(self):
b = g_main_program.current_block()
b = default_main_program().current_block()
w = b.create_var(
dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
self.assertNotEqual(str(w), "")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册