提交 fc4bcdd7 编写于 作者: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into prepare_pserver_executor

repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks.git - repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: v1.0.1 sha: v1.0.1
hooks: hooks:
...@@ -25,6 +26,14 @@ ...@@ -25,6 +26,14 @@
entry: bash ./.clang_format.hook -i entry: bash ./.clang_format.hook -i
language: system language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: local
hooks:
- id: cpplint-cpp-source
name: cpplint
description: Check C++ code style using cpplint.py.
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang - repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks: hooks:
......
...@@ -34,7 +34,7 @@ addons: ...@@ -34,7 +34,7 @@ addons:
- automake - automake
- libtool - libtool
- ccache - ccache
ssh_known_hosts: 52.76.173.135 ssh_known_hosts: 13.229.163.131
before_install: before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
......
...@@ -53,8 +53,7 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) ...@@ -53,8 +53,7 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON) option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
...@@ -109,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON) ...@@ -109,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON)
endif() endif()
if (WITH_C_API) if (WITH_C_API)
set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE) set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
endif() endif()
if(MOBILE_INFERENCE) if(MOBILE_INFERENCE)
...@@ -147,6 +146,7 @@ include(external/cares) ...@@ -147,6 +146,7 @@ include(external/cares)
include(external/grpc) include(external/grpc)
include(external/snappy) # download snappy include(external/snappy) # download snappy
include(external/snappystream) include(external/snappystream)
include(external/threadpool)
include(cudnn) # set cudnn libraries, must before configure include(cudnn) # set cudnn libraries, must before configure
include(cupti) include(cupti)
......
...@@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") ...@@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml") SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
...@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) ...@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(MKLML)\n" "PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n" "cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${MKLML_VER}\n" "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
" DESTINATION ${MKLML_DST_DIR})\n") " DESTINATION ${MKLML_DST_DIR})\n")
ExternalProject_Add( ExternalProject_Add(
......
INCLUDE(ExternalProject)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
ExternalProject_Add(
extern_threadpool
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/progschj/ThreadPool.git"
GIT_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040
PREFIX ${THREADPOOL_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
add_library(simple_threadpool STATIC ${dummyfile})
else()
add_library(simple_threadpool INTERFACE)
endif()
add_dependencies(simple_threadpool extern_threadpool)
LIST(APPEND external_project_dependencies simple_threadpool)
...@@ -587,6 +587,9 @@ function(grpc_library TARGET_NAME) ...@@ -587,6 +587,9 @@ function(grpc_library TARGET_NAME)
get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
#FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but
# somehow it didn't. line 602 to 604 is to patching this. Leaving this here
# for now to enable dist CI.
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
...@@ -597,6 +600,9 @@ function(grpc_library TARGET_NAME) ...@@ -597,6 +600,9 @@ function(grpc_library TARGET_NAME)
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
--plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
"${ABS_PROTO}"
DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
# FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
......
...@@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND) ...@@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND)
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir} DSTS ${dst_dir} ${dst_dir}
) )
elseif (WITH_MKLML)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
)
endif() endif()
# paddle fluid module # paddle fluid module
......
add_custom_target(paddle_apis ALL
DEPENDS paddle_v2_apis paddle_fluid_apis)
add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn
paddle_fluid_docs paddle_fluid_docs_cn)
add_subdirectory(v2) add_subdirectory(v2)
add_subdirectory(fluid) add_subdirectory(fluid)
digraph G {
subgraph cluster_init {
label="Initialization"
startup_program [label="startup", shape=box]
node_w_g0 [label="W\nGPU0"]
startup_program -> node_w_g0 [label="Initialize"]
node_w_g1 [label="W\nGPU1"]
node_w_g0 -> node_w_g1 [label="broadcast"]
}
subgraph cluster_train {
label="forward_backward"
subgraph cluster_gpu0 {
label="GPU0"
fc_0 [label="fc\nGPU0", shape=box]
hidden_0 [label="hidden\nGPU0"]
node_w_g0 -> fc_0
fc_0 -> hidden_0
loss0 [label="loss\nGPU0"]
hidden_0 -> loss0 [label="many ops omitted"]
scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
loss_g0 [label="loss_grad\nGPU0"]
scale_loss_0->loss_g0
fc_g_0 [label="w_grad\nGPU0", shape=box]
loss0 -> fc_g_0
loss_g0 -> fc_g_0
hidden_0 -> fc_g_0
}
subgraph cluster_gpu1 {
label="GPU1"
fc_1 [label="fc\nGPU1", shape=box]
hidden_1 [label="hidden\nGPU1"]
node_w_g1 -> fc_1
fc_1 -> hidden_1
loss1 [label="loss\nGPU1"]
hidden_1 -> loss1 [label="many ops omitted"]
scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
loss_g1 [label="loss_grad\nGPU1"]
scale_loss_1->loss_g1
fc_g_1 [label="w_grad\nGPU1", shape=box]
loss1 -> fc_g_1
loss_g1 -> fc_g_1
hidden_1 -> fc_g_1
}
}
all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
fc_g_0 -> all_reduce_w
fc_g_1 -> all_reduce_w
fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
all_reduce_w -> fc_g_0_merged
all_reduce_w -> fc_g_1_merged
subgraph cluster_optimization {
label="Optimization"
subgraph cluster_opt_gpu0 {
label="GPU0"
sgd_0 [label="SGD Op\nGPU0", shape=box]
fc_g_0_merged -> sgd_0
node_w_g0 -> sgd_0
optimized_w_0 [label="Optimized W\nGPU0"]
sgd_0 -> optimized_w_0
}
subgraph cluster_opt_gpu1 {
label="GPU1"
sgd_1 [label="SGD Op\nGPU1", shape=box]
fc_g_1_merged -> sgd_1
node_w_g1 -> sgd_1
optimized_w_1 [label="Optimized W\nGPU0"]
sgd_1 -> optimized_w_1
}
}
}
# ParallelExecutor
## Background
Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter(i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched in Python side.
The executor is a very naive interpreter. It runs operators one by one. We can use `Parallel.Do` to support data parallelism, however, lacking device information in `ProgramDesc`; it is not possible to optimize the performance of `Parallel.Do`.
We want a `ProgramDesc` can be run on different nodes. It is better not to contain device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full usage of Multi-GPUs.
ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs.
## Overview of MultiGPUs logic
The ParallelExecutor takes the startup program and main program as inputs. The parameters will be initialised on `GPU0` by startup program and will broadcast to multi-GPUs. The main program will be duplicated into multi-GPUs. The gradient will be merged during each iteration, and each device will optimize parameters independently. Since the gradients on each device will be merged before parameter optimization, the parameters will be the same on each device and it does not need to be broadcast the parameters.
![alt](images/parallel_executor_overview.png)
There are several optimizations for this logic.
1. We use an alternate representation in ParallelExecutor. It because the device information is critical for performance optimization.
2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready.
* GPU is a high-performance device; only one CPU thread cannot fulfil one GPU. So there is a thread pool to execute operators.
* Out-of-order also helps transpilers to generate `ProgramDesc`. It is no need to concern about the best order of performance when implementing a transpiler.
3. The streams of computation, merge gradients and fetch data are different.
The performance of `ResNeXt152` on `TitanX` which `batch_size=12` is shown below.
| Number of GPUs | 1 | 2 | 3 | 4|
| --- | --- | --- | --- | --- |
| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
## Static single assignment Graph
[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form)(`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we uses an `SSA` graph as an intermedia representation of `ProgramDesc`.
The `Program` is a directed acyclic graph, since a variable can be assigned multiple times. We enforce a variable will be assigned once, by adding version number to varaibles. We parsing the `Program` into a `SSA` graph. Also, ProgramExecutor duplicate `Program` into multi-devices. We also add a device number to varaibles and insert `NCCLAllReduce` into Graph.
The data structure of `SSA` graph is:
```c++
struct VarHandleBase {
OpHandleBase* generated_op_;
vector<OpHandleBase*> pending_ops_;
string name;
Place place;
size_t version;
};
struct OpHandleBase {
vector<OpHandleBase*> inputs_;
vector<OpHnadleBase*> outputs_;
};
struct SSAGraph {
// vars on each devices.
// * the vars in each map in vector is on different device.
// * the map is mapping a variable name to variable handles
// with different versions
vector<std::unordered_map<string, vector<VarHandleBase>>> vars_;
// All ops
vector<OpHandleBase> ops_;
};
```
The variable handles are the wrapper of `Variables`. The operator handles are the wrapper of `OperatorBase`. Some `OpHandle` is not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts.
When the `ProgramDesc` converted into an `SSA` Graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem is also need to be taken care. The dummy variables, which represent the dependency between operators, will be manually inserted into SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.
## Execute SSA Graph
The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is
1. Maintaining a map of an operator and its needed input number.
2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators.
3. If there is an operator which needed input number is decreased to zero, just run this operator.
4. After run this operator, just mark the variables are generated and repeat step 2 until all variables are generated.
Running an operator can be asynchronized. There is a thread pool to execute an `SSA` graph.
## Synchronize GPU Kernels
The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm:
1. `OpHandle` will record `DeviceContext` that it is used.
2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.
The `wait` are implemented by two strategies:
1. Invoke `DeviceContext->Wait()`, It will wait all operators on this device contexts complete.
2. Uses `cudaStreamWaitEvent` to sending a event to the stream. It is a non-blocking call. The wait operators will be executed in GPU.
Generally, the `cudaStreamWaitEvent` will have a better perforamnce. However, `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed in runtime.
## What's next?
* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done.
* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU compuatation faster, too.
* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision.
* Combine multi-Nodes implementation. By the benifit of out-of-order, sending and recving operator can be an blocking operator, and the transpiler does not need to concern about the best position of operator.
...@@ -27,6 +27,8 @@ sphinx_add_target(paddle_fluid_docs ...@@ -27,6 +27,8 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN}) ${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_fluid_docs gen_proto_py)
# configured documentation tools and intermediate build results # configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
...@@ -47,3 +49,7 @@ sphinx_add_target(paddle_fluid_docs_cn ...@@ -47,3 +49,7 @@ sphinx_add_target(paddle_fluid_docs_cn
${SPHINX_CACHE_DIR_CN} ${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN}) ${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_fluid_docs_cn gen_proto_py)
add_subdirectory(api)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
sphinx_add_target(paddle_fluid_apis
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
../../v2/build_and_install/build_from_source_cn.rst
\ No newline at end of file
../../v2/build_and_install/build_from_source_en.rst
\ No newline at end of file
../../v2/build_and_install/docker_install_cn.rst
\ No newline at end of file
../../v2/build_and_install/docker_install_en.rst
\ No newline at end of file
../../v2/build_and_install/index_cn.rst
\ No newline at end of file
../../v2/build_and_install/index_en.rst
\ No newline at end of file
../../v2/build_and_install/pip_install_cn.rst
\ No newline at end of file
../../v2/build_and_install/pip_install_en.rst
\ No newline at end of file
梯度更新算法
------------
.. toctree::
:maxdepth: 1
parameter_average.md
Gradient Update Algorithm
--------------------------------------
.. toctree::
:maxdepth: 1
parameter_average.md
...@@ -2,7 +2,7 @@ A few months ago when we were trying to replace CMake with Bazel, @emailweixu su ...@@ -2,7 +2,7 @@ A few months ago when we were trying to replace CMake with Bazel, @emailweixu su
Here are some initial thoughts. Your comments are welcome! Here are some initial thoughts. Your comments are welcome!
### Required CMake Function # Required CMake Function
I think we need only the following few CMake functions to make a project description mean and clean: I think we need only the following few CMake functions to make a project description mean and clean:
...@@ -25,7 +25,7 @@ Also, ...@@ -25,7 +25,7 @@ Also,
- to describe external dependencies, we need `external_library`. - to describe external dependencies, we need `external_library`.
- to build shared libraries, we need `shared_library`. - to build shared libraries, we need `shared_library`.
### An Example Project ## An Example Project
Suppose that we have aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files: Suppose that we have aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files:
...@@ -102,11 +102,11 @@ shared_library(api ...@@ -102,11 +102,11 @@ shared_library(api
``` ```
### Implementation ## Implementation
As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`. As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
### Using Package Manager For Go ## Using Package Manager For Go
Building Go binaries and libraries need to satisfy their dependencies, generally Building Go binaries and libraries need to satisfy their dependencies, generally
we can do `go get ./...` to download and compile all external dependencies. The we can do `go get ./...` to download and compile all external dependencies. The
...@@ -122,7 +122,7 @@ problems are: ...@@ -122,7 +122,7 @@ problems are:
at many cloud file hosting, so users what to compile paddle by themselves can at many cloud file hosting, so users what to compile paddle by themselves can
download this "vendor" package from a mirror site. download this "vendor" package from a mirror site.
#### Choose A Suitable Tool ### Choose A Suitable Tool
As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools) As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
list dozens of Go package managers. We choose the tool using following principles: list dozens of Go package managers. We choose the tool using following principles:
...@@ -140,7 +140,7 @@ management tool has been started at: https://github.com/golang/dep to resolve ...@@ -140,7 +140,7 @@ management tool has been started at: https://github.com/golang/dep to resolve
such problems, but it's currently at Alpha stage. So the best choice now is such problems, but it's currently at Alpha stage. So the best choice now is
glide obviously. glide obviously.
#### Manage Go Packages ### Manage Go Packages
- Dependencies: `go/glide.yaml` will store the dependencies and their versions which - Dependencies: `go/glide.yaml` will store the dependencies and their versions which
is directly imported by paddle. `go/glide.lock` will store all dependencies recursively is directly imported by paddle. `go/glide.lock` will store all dependencies recursively
......
...@@ -113,7 +113,7 @@ To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an e ...@@ -113,7 +113,7 @@ To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an e
To create and invoke readers, some new ops are introduced: To create and invoke readers, some new ops are introduced:
### CreateReaderOp ### Operators That Create Readers
Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers. Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
...@@ -153,19 +153,52 @@ double_buffer_reader = create_double_buffer_op(batch_reader) ...@@ -153,19 +153,52 @@ double_buffer_reader = create_double_buffer_op(batch_reader)
The forwarding ops of the corresponding `main_program` would be like this: The forwarding ops of the corresponding `main_program` would be like this:
``` ```
while_op { not_completed = true
pass_count = 0
while_op(not_completed) {
has_next = has_next_op(double_buffer_reader) has_next = has_next_op(double_buffer_reader)
if_else_op(has_next) { if_else_op(has_next) {
batch_data = read_op(double_buffer_reader) batch_data = read_op(double_buffer_reader)
... (subsequent training ops) ... (subsequent training ops)
} else { } else {
reset_op(double_buffer_reader) reset_op(double_buffer_reader)
increase_op(pass_count)
not_completed = less_than_op(pass_count, reqiured_pass_num)
} }
} }
``` ```
Two important considerations for these programs are as follows: A few important considerations for these programs are as follows:
1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader. 1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
2. All readers exist in both `startup_program` and `main_program`. And they are persistable. 2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
### Simplify Configuration by MultiPassReader
The Program configuration mentioned above is complicated. Users need to be very familiar to concepts of Program and Block to prevent making mistakes in their code. To make the usage of C++ readers more friendly to new users, we introduce `MultiPassReader`.
`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the training of given number of pass. If not, the underlying reader will be re-initialized and starts a new pass automatically. Before completing the whole training, the return of MultiPassReader's `HasNext()` will always be `true`.
With `MultiPassReader`, the startup program would be like this:
```
multiple_reader = open_files_op(...)
batch_reader = create_batch_reader_op(multiple_reader)
multi_pass_reader = create_multi_pass_reader_op(batch_reader)
double_buffer_reader = create_double_buffer_op(multi_pass_reader)
... (other initializers)
```
The forwarding part of the corresponding `main_program` would be like this:
```
not_completed = true
while_op(not_completed) {
batch_data = read_op(double_buffer_reader)
... (subsequent training ops)
not_completed = has_next_op(double_buffer_reader)
}
```
核心概念
-------------
.. toctree::
:maxdepth: 1
README.md
cpp_data_feeding.md
functions_operators_layers.md
program.md
variable.md
var_desc.md
tensor.md
tensor_array.md
lod_tensor.md
block.md
scope.md
executor.md
Core Concepts
--------------------------------------
.. toctree::
:maxdepth: 1
README.md
cpp_data_feeding.md
functions_operators_layers.md
program.md
variable.md
var_desc.md
tensor.md
tensor_array.md
lod_tensor.md
block.md
scope.md
executor.md
...@@ -78,7 +78,7 @@ In `Scope` class, there is a private data member called `parent_`. `parent_` is ...@@ -78,7 +78,7 @@ In `Scope` class, there is a private data member called `parent_`. `parent_` is
A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily. A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily.
# Interface Design ## Interface Design
```cpp ```cpp
class Variable { class Variable {
......
# Design Doc: Var_desc
## Background ## Background
PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations. PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
......
# Channel Design
## Introduction
A Channel is a data structure that allows for synchronous interprocess
communication via message passing. It is a fundemental component of CSP
(communicating sequential processes), and allows for users to pass data
between threads without having to worry about synchronization.
## How to use it
Paddle offers python APIs to open and close channels, along with sending
and receiving data to/from a channel.
### Create a channel
Creates a new channel that takes in variables of a specific dtype.
- **fluid.make_channel(dtype, capacity=0)**
- **dtype**: The data type of variables being sent/received through channel
- **capacity**: The capacity of the channel. A capacity of 0 represents
an unbuffered channel. Capacity > 0 represents a buffered channel
```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, 10)
```
### Close a channel
Closes a channel. Any pending senders and receivers will be awoken during
this time. Receivers can still receive from a closed channel, but senders
are not allowed to send any additional data to the channel (Paddle will
raise an exception if users try to send to a closed channel.)
- **fluid.channel_close(channel)**
```
fluid.channel_close(ch)
```
### Send data to a channel
Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
`ChannelHolder` are supported.
By default, the data of the Variable is moved from the sender to the receiver,
however the user can optionally copy the data before performing the send.
- **channel_send(channel, variable, is_copy=False)**
- **channel**: The channel to send the variable to
- **variable**: The variable to send to the channel
- **is_copy**: If set to True, channel_send will perform a variable assign
to copy the source variable to a new variable to be sent.
```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
fluid.channel_send(ch, var, True)
```
### Receive data from a channel
Receives a variable from a channel. The data of the variable is moved to the
receiving variable.
- **channel_recv(channel, return_variable)**
- **channel**: The channel to receive the variable from
- **return_variable**: The destination variable used to store the data of the
variable received from the channel
```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
fluid.channel_recv(ch, var)
```
## How it Works
Channels provides a simple interface for different threads to share data.
To support the synchronization requirements, channels utilizes a series of
internal queues, locks, and conditional variables.
### QueueMessage
QueueMessage encapsulates the state of the channel send/receive operation to be
put in the **sendq/recvq**. It contains a condition variable used to lock the
thread (when there are no available sends/receives). In addition, it contains
a callback function to notify a thread when the QueueMessage is being
processed by the channel.
### Queues
- **buff_**: This queue holds the data buffer in a buffered channel. The
capacity is set to the capacity of the channel. This data buffer is not
used in an unbuffered channel.
- **sendq**: This queue holds the QueueMessage of any pending senders of a
channel. When a thread performs a channel_send operation on the channel, the
channel_send operation will put a new QueueMessage on the sendq and block the
current thread under two conditions:
1. The channel is buffered and is full
2. The channel is unbuffered and does not have a receiver
- **recvq**: This queue holds the QueueMessage of any pending receivers of a
channel. When a thread performs a channel_recv operation on the channel, the
channel_recv operation will put a new QueueMessage on the recvq and block the
current thread under two conditions:
1. The channel is buffered and there is no data on the buff_
2. The channel is unbuffered and does not have a sender
### State diagram
#### Channel Send
<p align="center">
<img src="./images/channel_send.png"/><br/>
</p>
#### Channel Receive
<p align="center">
<img src="./images/channel_recv.png"/><br/>
</p>
## Limitations and Considerations
### Variable Copy
In golang, variables in channels are copied from the sender to the receiver.
In Paddle, the data from our variables are **moved** from sender to receiver.
As a result, these variables should not be used after they are sent. We
provide a flag in channel_send method to allow users to copy the variable to
be sent before it is sent.
Please note that this is acheived by adding an **assign** operator and creating
a temporary variable that is sent in place of the original variable. Please
note that **assign** operator has limited support for only certain variables
datatypes.
# go_op Design
## Introduction
The **go_op** allows user's of PaddlePaddle to run program blocks on a detached
thread. It works in conjuction with CSP operators (channel_send,
channel_receive, channel_open, channel_close, and select) to allow users to
concurrently process data and communicate easily between different threads.
## How to use it
```
channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
with fluid.Go():
# Send a tensor of value 99 to "channel" on a detached thread
tensor = fill_constant(shape=[1], dtype='int', value=99)
tensor.stop_gradient = True
fluid.channel_send(channel, tensor)
# Receive sent tensor from "channel" on the main thread
result = fill_constant(shape=[1], dtype='int', value=-1)
fluid.channel_recv(ch, result)
```
The go operator can be accessed by using the fluid.Go() control flow. This
will create a new sub block, where the user can add additional operators
to be ran on the thread.
**Note:** Since back propegation is currently not support in the go_op, users
should ensure that operators in the go block does not require gradient
calculations.
## How it Works
Similar to other control blocks, go_op will create a sub block and add it
as a child to the current block. Operators and variables defined in this
block will be added to the go sub_block.
In addition, the go operator will create a new child scope whose parent is
the global scope. Please refer to [block captures](#block-captures) for more
information.
When Paddle executor runs go_op, go_op will take the sub_block and pass it to
the executor.run method (along with a newly created local scope) on a detached
thread.
An example of the generated program description is shown below. Take note of
the **go_op** in particular. It is added as an operator in the current
block (in this example, block0). The **go_op** contains a `sub_block`
attribute, which points to the id of the block that will be executed in a
detached thread.
```
blocks {
idx: 0
parent_idx: -1
vars {
name: "return_value"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: INT64
}
}
}
}
vars {
name: "status_recv"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: BOOL
}
}
}
}
...
ops {
outputs {
parameter: "Out"
arguments: "channel"
}
type: "channel_create"
attrs {
name: "data_type"
type: INT
i: 7
}
attrs {
name: "capacity"
type: INT
i: 0
}
}
ops {
inputs {
parameter: "X"
arguments: "channel"
}
type: "go"
attrs {
name: "sub_block"
type: BLOCK
block_idx: 1
}
}
ops {
inputs {
parameter: "Channel"
arguments: "channel"
}
outputs {
parameter: "Out"
arguments: "return_value"
}
outputs {
parameter: "Status"
arguments: "status_recv"
}
type: "channel_recv"
}
...
}
blocks {
idx: 1
parent_idx: 0
vars {
name: "status"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: BOOL
}
}
}
}
...
ops {
outputs {
parameter: "Out"
arguments: "fill_constant_1.tmp_0"
}
type: "fill_constant"
attrs {
name: "force_cpu"
type: BOOLEAN
b: false
}
attrs {
name: "value"
type: FLOAT
f: 99.0
}
attrs {
name: "shape"
type: INTS
ints: 1
}
attrs {
name: "dtype"
type: INT
i: 3
}
}
ops {
inputs {
parameter: "Channel"
arguments: "channel"
}
inputs {
parameter: "X"
arguments: "fill_constant_1.tmp_0"
}
outputs {
parameter: "Status"
arguments: "status"
}
type: "channel_send"
attrs {
name: "copy"
type: BOOLEAN
b: false
}
}
```
## Current Limitations
#### <a name="block-captures"></a>Scopes and block captures:
Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
block. When a block is executed, a new local scope is created from the parent
scope (ie: scope derived from the parent block) and associated with the new
child block. After the block finishes executing, then the local scope and
all associated variables in the scope is deleted.
This works well in a single threaded scenario, however with introduction of
go_op, a child block may continue to execute even after the parent block has
exited. If the go_op tries to access variables located in the parent block's
scope, it may receive a segmentation fault because the parent scope may have
been deleted.
We need to implement block closures in order to prevent access to parent
scope variables from causing a segmentation fault. As a temporary workaround,
please ensure that all variables accessed in the go block is not destructed
before it is being accessed. Currently, the go_op will explicitly enforce
this requirement and raise an exception if a variable could not be found in
the scope.
Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
for more details.
#### Green Threads
Golang utilizes `green threads`, which is a mechnism for the runtime library to
manage multiple threads (instead of natively by the OS). Green threads usually
allows for faster thread creation and switching, as there is less overhead
when spawning these threads. For the first version of CSP, we only support
OS threads.
#### Backward Propegation:
go_op currently does not support backwards propagation. Please use go_op with
non training operators.
并发编程
------------
.. toctree::
:maxdepth: 1
concurrent_programming.md
parallel_do.md
Concurrent Programming
-------------------------
.. toctree::
:maxdepth: 1
concurrent_programming.md
parallel_do.md
数据类型
------------
.. toctree::
:maxdepth: 1
float16.md
Data Type
------------
.. toctree::
:maxdepth: 1
float16.md
## Design Doc: Distributed Lookup Table Operator # Design Doc: Distributed Lookup Table Operator
A lookup table operator in PaddlePaddle where the table could be out A lookup table operator in PaddlePaddle where the table could be out
of the memory of a computer. of the memory of a computer.
......
分布式训练
------------
.. toctree::
:maxdepth: 1
distributed_architecture.md
distributed_lookup_table_design.md
parameter_server.md
Distributed Training
---------------------
.. toctree::
:maxdepth: 1
distributed_architecture.md
distributed_lookup_table_design.md
parameter_server.md
动态RNN
------------
.. toctree::
:maxdepth: 1
rnn.md
rnn_design.md
Dynamic RNN
------------
.. toctree::
:maxdepth: 1
rnn.md
rnn_design.md
...@@ -233,7 +233,10 @@ x x ...@@ -233,7 +233,10 @@ x x
- 将每个序列concat 为规则的mini-batch表示 - 将每个序列concat 为规则的mini-batch表示
## 参考文献 ## 参考文献
1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing) [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5) [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
[Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
执行流程
-------------
.. toctree::
:maxdepth: 1
switch.md
if_else_op.md
Execution Process
--------------------------------------
.. toctree::
:maxdepth: 1
switch.md
if_else_op.md
### Design Doc: Switch # Design Doc: Switch
### Background ## Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid. Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
...@@ -19,7 +19,7 @@ with switch() as switch: ...@@ -19,7 +19,7 @@ with switch() as switch:
fluid.print("Case 3") fluid.print("Case 3")
``` ```
### The Semantics ## The Semantics
1. A `switch` control-flow checks cases one-by-one. 1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values. 1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
......
设计思想 设计思想
------------ ------------
.. toctree::
:maxdepth: 1
motivation/index_cn.rst
execution/index_cn.rst
concepts/index_cn.rst
data_type/index_cn.rst
memory/index_cn.rst
muti_devices/index_cn.rst
dynamic_rnn/index_cn.rst
concurrent/index_cn.rst
algorithm/index_cn.rst
network/index_cn.rst
modules/index_cn.rst
interface/index_cn.rst
dist_train/index_cn.rst
Design Design
------------ ------------
.. toctree::
:maxdepth: 1
motivation/index_en.rst
execution/index_en.rst
concepts/index_en.rst
data_type/index_en.rst
memory/index_en.rst
muti_devices/index_en.rst
dynamic_rnn/index_en.rst
concurrent/index_en.rst
algorithm/index_en.rst
network/index_en.rst
modules/index_en.rst
interface/index_en.rst
dist_train/index_en.rst
多语言接口
------------
TBD
Multi-Language Interface
-----------------------
TBD
内存管理
------------
.. toctree::
:maxdepth: 1
memory_optimization.md
Memory Management
-------------------
.. toctree::
:maxdepth: 1
memory_optimization.md
## Evaluator Design # Evaluator Design
### Problem Statement ## Problem Statement
During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants. During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
### Evaluator Design ## Evaluator Design
Currently, every operation is expressed in the graph. We divide the evaluator process into three steps. Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block. 1. Initialize the metric state and add it into the block.
...@@ -14,7 +14,7 @@ Currently, every operation is expressed in the graph. We divide the evaluator pr ...@@ -14,7 +14,7 @@ Currently, every operation is expressed in the graph. We divide the evaluator pr
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices. 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
### Implementation ## Implementation
This design is shown in the Python API. This design is shown in the Python API.
Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass.
......
代码结构和重要模块
-----------------
.. toctree::
:maxdepth: 1
backward.md
python_api.md
regularization.md
infer_var_type.md
optimizer.md
prune.md
register_grad_op.md
net_op_design.md
Code Structure and Important Modules
-------------------------------------
.. toctree::
:maxdepth: 1
backward.md
python_api.md
regularization.md
infer_var_type.md
optimizer.md
prune.md
register_grad_op.md
net_op_design.md
...@@ -8,9 +8,9 @@ A network object knows all Operators belonging to this network. Variables, ...@@ -8,9 +8,9 @@ A network object knows all Operators belonging to this network. Variables,
which are inputs and outputs of these operators, which are inputs and outputs of these operators,
are created and managed by a hierarchy of Scope objects. are created and managed by a hierarchy of Scope objects.
# API ## API
## Net ### Net
To make the `Network` extendable, a base class is defined like this To make the `Network` extendable, a base class is defined like this
```c++ ```c++
...@@ -80,7 +80,7 @@ if (net) { ...@@ -80,7 +80,7 @@ if (net) {
} }
``` ```
## `PlainNet` as a simple implementation of `BaseNet` ### `PlainNet` as a simple implementation of `BaseNet`
A very basic implementation is as follows. All it does is simply to run every operators in sequence. A very basic implementation is as follows. All it does is simply to run every operators in sequence.
...@@ -211,7 +211,7 @@ class NetBuilder final { ...@@ -211,7 +211,7 @@ class NetBuilder final {
} }
``` ```
## Compatibility with RNN ### Compatibility with RNN
Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design, Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design,
for example we can implement a simple recurrent neural network as follows for example we can implement a simple recurrent neural network as follows
......
## Optimizer Design # Optimizer Design
### The Problem ## The Problem
A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works:
...@@ -19,7 +19,7 @@ It's true that users should be able to create all these operators manually by ca ...@@ -19,7 +19,7 @@ It's true that users should be able to create all these operators manually by ca
In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass.
### High-level Python API to describe the training process ## High-level Python API to describe the training process
1. User write code to describe the network: 1. User write code to describe the network:
...@@ -54,7 +54,7 @@ In this design, we propose a high-level API that automatically derives the optim ...@@ -54,7 +54,7 @@ In this design, we propose a high-level API that automatically derives the optim
sess.run(target= opt_op_list, ...) sess.run(target= opt_op_list, ...)
``` ```
#### Optimizer Python interface: ### Optimizer Python interface:
```python ```python
class Optimizer(object): class Optimizer(object):
......
设计动机和目标
-------------
.. toctree::
:maxdepth: 1
api.md
refactorization.md
fluid.md
fluid_compiler.md
Design Motivations and Goals
--------------------------------------
.. toctree::
:maxdepth: 1
api.md
refactorization.md
fluid.md
fluid_compiler.md
...@@ -97,13 +97,13 @@ Compile Time -> IR -> Runtime ...@@ -97,13 +97,13 @@ Compile Time -> IR -> Runtime
--- ---
# Operator/OpWithKernel/OpKernel ## Operator/OpWithKernel/OpKernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot) ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
--- ---
# Operator ## Operator
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot) ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
* `Operator` is the fundamental building block of the user interface. * `Operator` is the fundamental building block of the user interface.
...@@ -113,7 +113,7 @@ Compile Time -> IR -> Runtime ...@@ -113,7 +113,7 @@ Compile Time -> IR -> Runtime
--- ---
# OpWithKernel/Kernel ## OpWithKernel/Kernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot) ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
...@@ -124,7 +124,7 @@ Compile Time -> IR -> Runtime ...@@ -124,7 +124,7 @@ Compile Time -> IR -> Runtime
--- ---
# Why separate Kernel and Operator ## Why separate Kernel and Operator
* Separate GPU and CPU code. * Separate GPU and CPU code.
* Make Paddle capable of running without GPU. * Make Paddle capable of running without GPU.
...@@ -132,7 +132,7 @@ Compile Time -> IR -> Runtime ...@@ -132,7 +132,7 @@ Compile Time -> IR -> Runtime
* For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel. * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel.
--- ---
# Libraries for Kernel development ## Libraries for Kernel development
* `Eigen::Tensor` contains basic math and element-wise functions. * `Eigen::Tensor` contains basic math and element-wise functions.
* Note that `Eigen::Tensor` has broadcast implementation. * Note that `Eigen::Tensor` has broadcast implementation.
...@@ -143,16 +143,16 @@ Compile Time -> IR -> Runtime ...@@ -143,16 +143,16 @@ Compile Time -> IR -> Runtime
* Hand-writing `GPUKernel` and `CPU` code * Hand-writing `GPUKernel` and `CPU` code
* Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.) * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
--- ---
# Operator Registration ## Operator Registration
## Why is registration necessary? ### Why is registration necessary?
We need a method to build mappings between Op type names and Op classes. We need a method to build mappings between Op type names and Op classes.
## How is registration implemented? ### How is registration implemented?
Maintaining a map, whose key is the type name and the value is the corresponding Op constructor. Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
--- ---
# The Registry Map ## The Registry Map
### `OpInfoMap` ### `OpInfoMap`
...@@ -166,7 +166,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding ...@@ -166,7 +166,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding
- **`checker`**: Used to check attributes. - **`checker`**: Used to check attributes.
--- ---
# Related Concepts ## Related Concepts
### Op_Maker ### Op_Maker
It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)) It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
...@@ -178,7 +178,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) ...@@ -178,7 +178,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
``` ```
--- ---
# Registration Process ## Registration Process
1. Write an Op class and its gradient Op class, if required. 1. Write an Op class and its gradient Op class, if required.
2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator. 2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
3. Invoke the macro `REGISTER_OP`. This macro will 3. Invoke the macro `REGISTER_OP`. This macro will
...@@ -186,13 +186,13 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) ...@@ -186,13 +186,13 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap` 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
--- ---
# Backward Module (1/2) ## Backward Module (1/2)
### Create Backward Operator ### Create Backward Operator
- Mapping from forward Op to backward Op - Mapping from forward Op to backward Op
![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png) ![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
--- ---
# Backward Module (2/2) ## Backward Module (2/2)
### Build Backward Network ### Build Backward Network
- **Input**: a graph of forward operators - **Input**: a graph of forward operators
- **Output**: a graph of backward operators - **Output**: a graph of backward operators
...@@ -205,7 +205,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) ...@@ -205,7 +205,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
--- ---
# Scope, Variable, Tensor ## Scope, Variable, Tensor
* `Tensor` is an n-dimension array with type. * `Tensor` is an n-dimension array with type.
* Only dims and data pointers are stored in `Tensor`. * Only dims and data pointers are stored in `Tensor`.
...@@ -218,8 +218,8 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) ...@@ -218,8 +218,8 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope. * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
--- ---
# Block (in design) ## Block (in design)
## the difference between original RNNOp and Block ### the difference between original RNNOp and Block
- As an operator is more intuitive than `RNNOp`, - As an operator is more intuitive than `RNNOp`,
- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`, - Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
- Fits the compile-time/ runtime separation design paradigm. - Fits the compile-time/ runtime separation design paradigm.
...@@ -227,7 +227,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) ...@@ -227,7 +227,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
- When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`. - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
--- ---
# Milestone ## Milestone
- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring, - Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
- Model migration - Model migration
- Framework development gives **priority support** to model migration, for example, - Framework development gives **priority support** to model migration, for example,
...@@ -240,7 +240,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) ...@@ -240,7 +240,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
- Accept imperfection, concentrate on solving the specific problem at the right price. - Accept imperfection, concentrate on solving the specific problem at the right price.
--- ---
# Control the migration quality ## Control the migration quality
- Compare the performance of migrated models with old ones. - Compare the performance of migrated models with old ones.
- Follow the google C++ style guide. - Follow the google C++ style guide.
- Build the automatic workflow of generating Python/C++ documentations. - Build the automatic workflow of generating Python/C++ documentations.
......
多设备支持
------------
.. toctree::
:maxdepth: 1
operator_kernel_type.md
kernel_selection.md
kernel_hint_design.md
Multi-Device Support
----------------------
.. toctree::
:maxdepth: 1
operator_kernel_type.md
kernel_selection.md
kernel_hint_design.md
## Problem # Problem
In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
In the current design, we use KernelType to describe one kernel. In the current design, we use KernelType to describe one kernel.
......
## Background # Background
Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold. Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold.
The `OpKernelType ` is as follows: The `OpKernelType ` is as follows:
......
复杂网络设计
------------
.. toctree::
:maxdepth: 1
sequence_decoder.md
Complex Network Design
------------------------
.. toctree::
:maxdepth: 1
sequence_decoder.md
# API Doc Standard
- [API Doc Structure](#API Doc Structure)
- [Format and Examples](#Format and Examples)
- [Complete Example](#Complete Example)
## API Doc Structure
API Doc should contain the following parts(please write them in order):
- Python API Definition
The definition of API
- Function Description
Description of API's function.
The description includes: meaning, purpose and operation on input of API, reference and corresponding link(if any), formula(if necessary) and explanations of key variables in the formula.
- Args Description
Description of API parameters.
Introduce parameters one by one according to the order in API definition.
The introduction includes: data type, default value(if any), meaning, etc.
- Returns
Introduction of API returned value.
Introduce meaning of returned value, provide correspoding format if necessary.
If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order.
- Raises(if any)
Abnormality, error that may occur, and possible reasons. If there are more than one possible abnormity or error, they should be listed in order.
- Note(if any)
Matters needing attention. If there are more than one matters, they should be listed in order.
- Examples
Examples of how to use API.
## Format and Examples
API documentation must obey reStructuredText format, please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
Format and examples of each part of API documantation are as follows: (take fc for example)
- Python API Definition
- Format
[Python API Definition]
- Example
```
fc(input,
size,
num_flatten_dims=1,
param_attr=None,
bias_attr=None,
act=None,
name=None,
main_program=None,
startup_program=None)
```
- Function Description
- Format
This part contains (please write them in order):
[Function Description]
[Formula]
[Symbols' Descriptions if necessary]
[References if necessary]
- Example
[Function Description]
```
**Fully Connected Layer**
The fully connected layer can take multiple tensors as its inputs. It
creates a variable called weights for each input tensor, which represents
a fully connected weight matrix from each input unit to each output unit.
The fully connected layer multiplies each input tensor with its coresponding
weight to produce an output Tensor. If multiple input tensors are given,
the results of multiple multiplications will be sumed up. If bias_attr is
not None, a bias variable will be created and added to the output. Finally,
if activation is not None, it will be applied to the output as well.
```
[Formula]
```
This process can be formulated as follows:
.. math::
Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
```
[Symbols' Descriptions if necessary]
```
In the above equation:
* :math:`N`: Number of the input.
* :math:`X_i`: The input tensor.
* :math:`W`: The weights created by this layer.
* :math:`b`: The bias parameter created by this layer (if needed).
* :math:`Act`: The activation function.
* :math:`Out`: The output tensor.
```
[References if necessary]
Since there is no need for reference of fc, we omit them here. Under other circumstances, please provide explicit reference and link, take layer_norm for example:
```
Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
```
- Args Description
- Format
\[Arg's Name\][(Data Type, Default Value)][Description]
- Example
part of fc parameters are as follows:
```
Args:
input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
the input tensor(s) is at least 2.
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer.
name (str, default None): The name of this layer.
```
- Returns
- Format
[Name][Shape]
- Example
```
Returns:
A tensor variable storing the transformation result.
```
when returned value contain more than one tuple, please introduce every parameter in order, take dynamic_lstm for example:
```
Returns:
A tuple containing:
The hidden state of LSTM whose shape is (T X D).
The cell state of LSTM whose shape is (T X D).
```
- Raises
- Format
[Exception Type][Condition]
- Example
```
Raises:
ValueError: If the rank of the input is less than 2.
```
- Note
- Format
[Note]
- Example
there is no Note in fc, so we omit this part. If there is any note, please write clearly. If there are more than one notes, please list them in order. Take scaled\_dot\_product\_attention for example:
```
Note:
1. When num_heads > 1, three linear projections are learned respectively
to map input queries, keys and values into queries', keys' and values'.
queries', keys' and values' have the same shapes with queries, keys
and values.
2. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
```
- Examples
- Format
\[Python Code Snipper]
- Example
```
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
fc = fluid.layers.fc(input=data, size=1000, act="tanh")
```
## Complete Example
Complete Example of fc please see [here](src/fc.py)
开发标准 开发标准
------------ ------------
.. toctree::
:maxdepth: 1
new_op_en.md
new_op_kernel_en.md
use_eigen_en.md
name_convention.md
support_new_device.md
releasing_process.md
op_markdown_format.md
Development Development
------------ ------------
This is Development page .. toctree::
:maxdepth: 1
new_op_en.md
new_op_kernel_en.md
use_eigen_en.md
name_convention.md
support_new_device.md
releasing_process.md
op_markdown_format.md
## Operator's Parameter Name Convention # Operator's Parameter Name Convention
To make the operator document itself more clear, we recommend operator names obey the listing conventions. To make the operator document itself more clear, we recommend operator names obey the listing conventions.
### OpProtoMaker names ## OpProtoMaker names
When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator. When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator.
...@@ -20,7 +20,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith ...@@ -20,7 +20,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
- Order. - Order.
- Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice. - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
### Best Practice ## Best Practice
Here we give some examples to show how these rules will be used. Here we give some examples to show how these rules will be used.
......
## Add Kernels for a New Device # Add Kernels for a New Device
### Background ## Background
PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU. PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). [This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
### Write Kernels for A New Device ## Write Kernels for A New Device
#### Add A New Device ### Add A New Device
For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP. For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP.
...@@ -23,7 +23,7 @@ enum class LibraryType { ...@@ -23,7 +23,7 @@ enum class LibraryType {
``` ```
#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53) ### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`: If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
...@@ -45,7 +45,7 @@ struct CUDAPlace { ...@@ -45,7 +45,7 @@ struct CUDAPlace {
typedef boost::variant<CUDAPlace, CPUPlace> Place; typedef boost::variant<CUDAPlace, CPUPlace> Place;
``` ```
#### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)) ### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37))
After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it. After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
```cpp ```cpp
...@@ -58,7 +58,7 @@ class DeviceContext { ...@@ -58,7 +58,7 @@ class DeviceContext {
}; };
``` ```
#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device. ### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md)
...@@ -85,7 +85,7 @@ class OpKernel : public OpKernelBase { ...@@ -85,7 +85,7 @@ class OpKernel : public OpKernelBase {
``` ```
#### Register the OpKernel to framework ### Register the OpKernel to framework
After writing the components described above, we should register the kernel to the framework. After writing the components described above, we should register the kernel to the framework.
......
...@@ -15,26 +15,26 @@ The signature of the operator. ...@@ -15,26 +15,26 @@ The signature of the operator.
Each section mentioned above has been covered in further detail in the rest of the document. Each section mentioned above has been covered in further detail in the rest of the document.
# PaddlePaddle Operator Name ## PaddlePaddle Operator Name
This should be in all small letters, in case of multiple words, we separate them with an underscore. For example: This should be in all small letters, in case of multiple words, we separate them with an underscore. For example:
`array to lod tensor` should be written as `array_to_lod_tensor`. `array to lod tensor` should be written as `array_to_lod_tensor`.
This naming convention should be standard across all PaddlePaddle operators. This naming convention should be standard across all PaddlePaddle operators.
# Standard Operator Name ## Standard Operator Name
This is the standard name of the operator as used in the community. The general standard is usually: This is the standard name of the operator as used in the community. The general standard is usually:
- Standard abbreviations like `SGD` are written in all capital letters. - Standard abbreviations like `SGD` are written in all capital letters.
- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word). - Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
- Keep numbers inside a word as is, with no boundary delimiters. - Keep numbers inside a word as is, with no boundary delimiters.
- Follow the name of the operator with the keyword: `Activation Operator.` - Follow the name of the operator with the keyword: `Activation Operator.`
# Operator description ## Operator description
This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section. This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
# LaTeX equation ## LaTeX equation
This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`). This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`).
# The signature ## The signature
This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is: This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
`Section : `Section :
VariableName : (VariableType) VariableDescription VariableName : (VariableType) VariableDescription
......
## 在Paddle中如何使用Eigen # 在Paddle中如何使用Eigen
神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。 神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。
### Eigen Tensor模块 ## Eigen Tensor模块
Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。
关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md)[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) 关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md)[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
### paddle::framework::Tensor ## paddle::framework::Tensor
Paddle Tensor定义在framework目录下,其主要接口如下: Paddle Tensor定义在framework目录下,其主要接口如下:
...@@ -129,7 +129,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework ...@@ -129,7 +129,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework
### 实现计算 ## 实现计算
当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。 当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。
......
## How to use Eigen in Paddle # How to use Eigen in Paddle
Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`. Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
### Eigen Tensor Module ## Eigen Tensor Module
The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU. The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
...@@ -12,7 +12,7 @@ Note that Eigen Tensor is still being actively developed, so its tests are not c ...@@ -12,7 +12,7 @@ Note that Eigen Tensor is still being actively developed, so its tests are not c
For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md). For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
### paddle::framework::Tensor ## paddle::framework::Tensor
Paddle Tensor's is defined in the framework directory with the following interface: Paddle Tensor's is defined in the framework directory with the following interface:
...@@ -105,7 +105,7 @@ void Compute(const framework::ExecutionContext& context) const override { ...@@ -105,7 +105,7 @@ void Compute(const framework::ExecutionContext& context) const override {
``` ```
### paddle::framework::Tensor到EigenTensor的转换 ## paddle::framework::Tensor到EigenTensor的转换
As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`. As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
...@@ -129,7 +129,7 @@ For more transformations, see the [unit tests](https://github.com/PaddlePaddle/P ...@@ -129,7 +129,7 @@ For more transformations, see the [unit tests](https://github.com/PaddlePaddle/P
### Implementing Computation ## Implementing Computation
While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor. While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor.
......
基本使用概念
============
TBD
新手入门 新手入门
------------ ============
新手入门
如果需要快速了解PaddlePaddle的使用,可以参考以下指南。
.. toctree::
:maxdepth: 1
quickstart_cn.rst
在使用PaddlePaddle构建应用时,需要了解一些基本概念。
这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。
.. toctree::
:maxdepth: 1
concepts/use_concepts_cn.rst
GET STARTED GET STARTED
------------ ============
This is get started page If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
.. toctree::
:maxdepth: 1
quickstart_en.rst
While using PaddlePaddle to build applications, please understand some basic concepts.
Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
.. toctree::
:maxdepth: 1
concepts/index_en.rst
../../v2/getstarted/quickstart_cn.rst
\ No newline at end of file
../../v2/getstarted/quickstart_en.rst
\ No newline at end of file
进阶使用 进阶使用
------------ ------------
.. toctree::
:maxdepth: 1
optimization/index_cn.rst
HOW TO HOW TO
------------ ------------
This is how to page .. toctree::
:maxdepth: 1
optimization/index_en.rst
../../../../../benchmark/cluster/README.md
\ No newline at end of file
基准
------------
.. toctree::
:maxdepth: 1
vgg16/README.md
README.md
Benchmark
------------
.. toctree::
:maxdepth: 1
vgg16/README.md
README.md
../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
...@@ -8,7 +8,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大 ...@@ -8,7 +8,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大
* Python 与 C++ 混合代码的性能分析 * Python 与 C++ 混合代码的性能分析
## Python代码的性能分析 # Python代码的性能分析
### 生成性能分析文件 ### 生成性能分析文件
......
...@@ -14,7 +14,7 @@ the profiling and tuning of ...@@ -14,7 +14,7 @@ the profiling and tuning of
1. the Python code and 1. the Python code and
1. the mixture of Python and C++ code. 1. the mixture of Python and C++ code.
## Profiling the Python Code # Profiling the Python Code
### Generate the Performance Profiling File ### Generate the Performance Profiling File
......
性能优化
------------
.. toctree::
:maxdepth: 1
timeline.md
cpu_profiling_cn.md
benchmark/index_cn.rst
Performance Optimization
---------------------------
.. toctree::
:maxdepth: 1
timeline.md
cpu_profiling_en.md
benchmark/index_en.rst
## how to use timeline tool to do profile # how to use timeline tool to do profile
1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. 1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
:maxdepth: 1 :maxdepth: 1
getstarted/index_cn.rst getstarted/index_cn.rst
design/index_cn.rst
build_and_install/index_cn.rst build_and_install/index_cn.rst
design/index_cn.rst
howto/index_cn.rst howto/index_cn.rst
dev/index_cn.rst dev/index_cn.rst
faq/index_cn.rst faq/index_cn.rst
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
:maxdepth: 1 :maxdepth: 1
getstarted/index_en.rst getstarted/index_en.rst
design/index_en.rst
build_and_install/index_en.rst build_and_install/index_en.rst
design/index_en.rst
howto/index_en.rst howto/index_en.rst
dev/index_en.rst dev/index_en.rst
faq/index_en.rst faq/index_en.rst
...@@ -20,13 +20,15 @@ configure_file( ...@@ -20,13 +20,15 @@ configure_file(
"${BINARY_BUILD_DIR_EN}/conf.py" "${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY) @ONLY)
sphinx_add_target(paddle_docs sphinx_add_target(paddle_v2_docs
html html
${BINARY_BUILD_DIR_EN} ${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN} ${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN}) ${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_v2_docs gen_proto_py)
# configured documentation tools and intermediate build results # configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
...@@ -41,11 +43,13 @@ configure_file( ...@@ -41,11 +43,13 @@ configure_file(
"${BINARY_BUILD_DIR_CN}/conf.py" "${BINARY_BUILD_DIR_CN}/conf.py"
@ONLY) @ONLY)
sphinx_add_target(paddle_docs_cn sphinx_add_target(paddle_v2_docs_cn
html html
${BINARY_BUILD_DIR_CN} ${BINARY_BUILD_DIR_CN}
${SPHINX_CACHE_DIR_CN} ${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN}) ${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_v2_docs_cn gen_proto_py)
add_subdirectory(api) add_subdirectory(api)
...@@ -12,9 +12,11 @@ configure_file( ...@@ -12,9 +12,11 @@ configure_file(
"${BINARY_BUILD_DIR_EN}/conf.py" "${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY) @ONLY)
sphinx_add_target(paddle_api_docs sphinx_add_target(paddle_v2_apis
html html
${BINARY_BUILD_DIR_EN} ${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN} ${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN}) ${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind)
多语言接口
------------
.. toctree::
:maxdepth: 1
00.why_plain_c.md
Multilingual Interface
-----------------------
.. toctree::
:maxdepth: 1
00.why_plain_c.md
Development Development
------------ ------------
PaddlePaddle adheres to the following three sections of code and document specifications.
PaddlePaddle uses git for version control and Docker is used for building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages,which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development.
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
contribute_to_paddle_en.md contribute_to_paddle_en.md
PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem description. The design documents focus on problem descriptions, backgrounds, and are followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend to use the paddlepaddle.org tool to compile and generate and preview documents locally. Please refer to:
.. toctree::
:maxdepth: 1
write_docs_en.rst write_docs_en.rst
PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize layer, please refer to the following, and welcome to propose patch.
.. toctree::
:maxdepth: 1
new_layer_en.rst new_layer_en.rst
...@@ -139,3 +139,77 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二 ...@@ -139,3 +139,77 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二
touch ../extern_mklml-stamp/extern_mklml-download touch ../extern_mklml-stamp/extern_mklml-download
// 4. 接着编译即可 // 4. 接着编译即可
9. 在Mac上无法安装numpy等Python包,权限错误
------------------
Mac上对自带的Python和包有严格的权限保护,最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。
virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝,并在这多个拷贝之间自由切换,这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。
下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境:
安装virtualenv:
::::::::::::::::
virtualenv本身也是Python的一个包,可以用pip进行安装:
.. code-block:: bash
sudo -H pip install virtualenv
由于virtualenv需要安装给系统自带的Python,因此需要使用sudo权限。
创建一个新的Python运行环境:
:::::::::::::::::::
.. code-block:: bash
virtualenv --no-site-packages paddle
--no-site-packages 参数表示不拷贝已有的任何第三方包,创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。
执行完这一步后,当前目录下应该会出现一个名为paddle(或者你取的其他名字)的目录。这个目录里保存了运行一个Python环境所需要的各种文件。
启动运行环境:
::::::::::::::::
.. code-block:: bash
source paddle/bin/activate
执行后会发现命令提示符前面增加了(paddle)字样,说明已经成功启动了名为‘paddle’的Python环境。执行which python,可以发现使用的已经是刚刚创建的paddle目录下的Python。
在这个环境中,我们可以自由地进行Paddle的安装、使用和开发工作,无需担心对系统自带Python的影响。
退出运行环境:
:::::::::::::::
直接执行:
.. code-block:: bash
deactivate
可以看到命令提示符前面的(paddle)字样消失。
自动启动某一Python环境:
::::::::::::::::
如果我们经常使用Paddle,我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境,比较繁琐。为了简便,可以修改终端的配置文件,来让终端每次启动后自动启动特定的Python环境。
执行:
.. code-block:: bash
vi ~/.bash_profile
打开终端配置文件,并在文件的最后添加一行:
.. code-block:: bash
source paddle/bin/activate
保存并关闭文件。
这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。
...@@ -2,4 +2,15 @@ ...@@ -2,4 +2,15 @@
Cluster Training and Prediction Cluster Training and Prediction
############################### ###############################
TBD .. contents::
1. Network connection errors in the log during multi-node cluster training
------------------------------------------------
There are maybe some errors in the log belonging to network connection problem during multi-node cluster training, for example, :code:`Connection reset by peer`.
This kind of error is usually caused by the abnormal exit of a training process in some node, and the other nodes cannot connect with this node any longer. Steps to troubleshoot the problem are as follows:
* Find the first error in the :code:`train.log`, :code:`server.log`, check whether other fault casued the problem, such as FPE, lacking of memory or disk.
* If the first error in server.log says "Address already used", this may be caused by the port conflict of the non-exclusive execution. Connect the sys-admin to check if the current MPI cluster supports jobs submitted with parameter :code:`resource=full`. If the current MPI cluster does not support this parameter, change the server port and try agian.
* If the current MPI cluster does not support exclusive pattern which allows a process to occupy the whole node, ask the administrator to replace or update the this cluster.
...@@ -2,4 +2,80 @@ ...@@ -2,4 +2,80 @@
Model Configuration Model Configuration
################### ###################
TBD .. contents::
1. How to deal with error :code:`Duplicated layer name`
----------------------------------------------------------
The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Try to find out the :code:`name` attribute with the same value in diffrent layers and set them differently.
2. How to use :code:`paddle.layer.memory`'s attribute :code:`name`
----------------------------------------------------------------------
* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep and the layer is specified by the attribute :code:`name` . Thus, :code:`paddle.layer.memory` will associate with the layer that has the same value of attribute :code:`name` , and uses the output of the layer's last timestep as the input of its current timestep.
* All the PaddlePaddle's layers have a unique name, which is set by the attribute :code:`name` . PaddlePaddle will automatically set it for the user when it is not explicitly set. :code:`paddle.layer.memory` is not a real layer, its name is set by the attribute :code:`memory_name` and PaddlePaddle will also automatically set it when the user does not explicitly set. The :code:`paddle.layer.memory` attribute :code:`name` is used to specify the layer it is associated with, and needs to be explicitly set by the user.
3. What is the difference between the two ways of using dropout
-----------------------------------------------------------------
* There are two ways to use dropout in PaddlePaddle
* Set the :code:`drop_rate` parameter in the layer's :code:`layer_atter` attribute. Take :code:`paddle.layer.fc` as an example:
.. code-block:: python
fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
* Use :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example:
.. code-block:: python
fc = paddle.layer.fc(input=input)
drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` as the previous method. This method is very memory intensive.
* PaddlePaddle implements dropout in the activation function rather than in the layer.
* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, :code:`paddle.layer.recurrent` implement activation of output in an unusual way, so we cannot use dropout by setting :code:`drop_rate` . To use dropout for these layers, we could use the second method, which is to use :code:`paddle.layer.dropout`.
4. The differences between different recurrent layers
--------------------------------------------------------
Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle:
* :code:`paddle.layer.lstmemory`
* :code:`paddle.networks.simple_lstm`
* :code:`paddle.networks.lstmemory_group`
* :code:`paddle.networks.bidirectional_lstm`
According to implementations, recurrent layer can be classified into 2 types:
1. Recurrent layer implemented by recurrent_group:
* Using this type of recurrent layers, users can access the intermediate value calculated by the recurrent unit within a timestep (eg: hidden states, memory cells, etc.)
* :code:`paddle.networks.lstmemory_group` belongs to this type of recurrent layers.
2. Recurrent layer implemented as a complete operation:
* Users can only access output values when using this type of recurrent layers.
* :code:`paddle.networks.lstmemory_group` , :code:`paddle.networks.simple_lstm` and :code:`paddle.networks.bidirectional_lstm` belong to this type of recurrent layer;
By implementing recurrent layer as a complete operation, CPU and GPU calculations can be optimized. Therefore, the second type of recurrent layer is more efficient than the first one. In practical applications, we propose to use the second type of recurrent layers if there is no need to access the intermediate variable of LSTM.
In addition, PaddlePaddle also contains a kind of LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`:
* Unlike the recurrent layer described above, :code:`paddle.networks.lstmemory_unit` defines the computational process of an LSTM unit in a timestep. It is not a complete recurrent layer, nor can it receive sequence data as input.
* :code:`paddle.networks.lstmemory_unit` can only be used as a step function in recurrent_group.
5. Can Softmax's calculation dimension be specified?
--------------------------------------------------------------------
We can't specify calculation dimension for PaddlePaddle's softmax. It can only be calculated by rows.
In image tasks, for NCHW, if you need to calculate softmax in C dimension, you could use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then do the reshape operation and calculate softmax.
6. Does PaddlePaddle support variable-dimensional data inputs
----------------------------------------------------------------
PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data for occupancy.
...@@ -2,10 +2,25 @@ ...@@ -2,10 +2,25 @@
Set Command-line Parameters Set Command-line Parameters
=========================== ===========================
The implementation of deep learning algorithms has a variety of characteristics, such as running environment, running stage, structure of the model and the traning strategy. PaddlePaddle supports the user to set various command-line parameters flexibly, which helps to achieve control of the model training or prediction process.
In this part, we take several actual scenarios as an example, and the use of some command-line parameters is displayed:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
use_case_en.md use_case_en.md
Then, we summarize and classify the use of all command-line parameters:
.. toctree::
:maxdepth: 1
arguments_en.md arguments_en.md
Finally, the detailed descriptions are given, and we try to explain the propeties and significance of these command-line parameters in detail:
.. toctree::
:maxdepth: 1
detail_introduction_en.md detail_introduction_en.md
...@@ -6,32 +6,32 @@ PaddlePaddle provides the users the ability to flexibly set various command line ...@@ -6,32 +6,32 @@ PaddlePaddle provides the users the ability to flexibly set various command line
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
cmd_parameter/index_cn.rst cmd_parameter/index_en.rst
PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to: PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
cluster/index_cn.rst cluster/index_en.rst
PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API: PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
capi/index_cn.rst capi/index_en.rst
PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to: PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
rnn/index_cn.rst rnn/index_en.rst
How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to: How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
optimization/gpu_profiling_cn.rst optimization/gpu_profiling_en.rst
if(NOT WITH_FLUID) if(NOT WITH_FLUID_ONLY)
add_subdirectory(cuda) add_subdirectory(cuda)
add_subdirectory(function) add_subdirectory(function)
add_subdirectory(utils) add_subdirectory(utils)
......
add_subdirectory(details)
# ddim lib # ddim lib
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
...@@ -87,6 +88,9 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo ...@@ -87,6 +88,9 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table feed_fetch_method) framework_proto backward glog lod_rank_table feed_fetch_method)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
......
...@@ -147,15 +147,52 @@ void BlockDesc::RemoveOp(size_t s, size_t e) { ...@@ -147,15 +147,52 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
return; return;
} }
auto get_vars = [](std::deque<std::unique_ptr<OpDesc>>::iterator &op,
std::vector<std::string> &v) {
auto in_names = (*op)->InputArgumentNames();
v.insert(v.end(), in_names.begin(), in_names.end());
auto out_names = (*op)->OutputArgumentNames();
v.insert(v.end(), out_names.begin(), out_names.end());
std::sort(v.begin(), v.end());
auto last = std::unique(v.begin(), v.end());
v.erase(last, v.end());
};
need_update_ = true; need_update_ = true;
for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
auto names = (*it)->InputArgumentNames(); for (size_t i = s; i < e; i++) {
for (auto n : names) { // since remove op one by one, every time remove the first op.
// TODO(typhoonzero): delete vars if no other op use it. auto op = ops_.begin() + s;
VLOG(3) << "deleting var " << n;
// collect input and output variables from current delete op
std::vector<std::string> cur_vars;
get_vars(op, cur_vars);
// remove current op
ops_.erase(ops_.begin() + s);
// collect input and output variables from other ops
std::vector<std::string> other_vars;
for (auto it = ops_.begin(); it != ops_.end(); it++) {
get_vars(it, other_vars);
}
// variables should be deleted
std::vector<std::string> delete_vars;
// delete_vars = cur_vars - cur_vars ^ other_input_vars
std::set_difference(cur_vars.begin(), cur_vars.end(), other_vars.begin(),
other_vars.end(),
std::inserter(delete_vars, delete_vars.end()));
// remove variables
for (size_t i = 0; i < delete_vars.size(); i++) {
auto name = delete_vars[i];
auto it = vars_.find(name);
PADDLE_ENFORCE(it != vars_.end(),
"%s is not in variable list, it should not be deleted",
name);
vars_.erase(it);
VLOG(3) << "deleting variable " << name;
} }
} }
ops_.erase(ops_.begin() + s, ops_.begin() + e);
} }
std::vector<OpDesc *> BlockDesc::AllOps() const { std::vector<OpDesc *> BlockDesc::AllOps() const {
......
...@@ -89,6 +89,11 @@ class BlockDesc { ...@@ -89,6 +89,11 @@ class BlockDesc {
OpDesc *InsertOp(size_t index); OpDesc *InsertOp(size_t index);
/*
* Remove Op and its input/output variables.
* Note that for either input or ouput variable, if it is also an input or
* output variable of other ops, we should remain it.
*/
void RemoveOp(size_t s, size_t e); void RemoveOp(size_t s, size_t e);
std::vector<OpDesc *> AllOps() const; std::vector<OpDesc *> AllOps() const;
......
...@@ -34,7 +34,7 @@ class Channel { ...@@ -34,7 +34,7 @@ class Channel {
public: public:
virtual bool CanSend() = 0; virtual bool CanSend() = 0;
virtual bool CanReceive() = 0; virtual bool CanReceive() = 0;
virtual bool Send(T*) = 0; virtual void Send(T*) = 0;
virtual bool Receive(T*) = 0; virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0; virtual size_t Cap() = 0;
virtual void Lock() = 0; virtual void Lock() = 0;
...@@ -84,96 +84,113 @@ class ChannelHolder { ...@@ -84,96 +84,113 @@ class ChannelHolder {
} }
template <typename T> template <typename T>
bool Send(T* data) { void Send(T* data) {
if (!IsInitialized()) return false; PADDLE_ENFORCE_EQ(IsInitialized(), true,
PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T))); "The Channel hasn't been initialized");
PADDLE_ENFORCE_EQ(
holder_->Type(), std::type_index(typeid(T)),
"Channel type is not same as the type of the data being sent");
// Static cast should be safe because we have ensured that types are same // Static cast should be safe because we have ensured that types are same
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr()); Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
return channel != nullptr ? channel->Send(data) : false; PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
channel->Send(data);
} }
template <typename T> template <typename T>
bool Receive(T* data) { bool Receive(T* data) {
if (!IsInitialized()) return false; PADDLE_ENFORCE_EQ(IsInitialized(), true,
PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T))); "The Channel hasn't been initialized");
PADDLE_ENFORCE_EQ(
holder_->Type(), std::type_index(typeid(T)),
"Channel type is not same as the type of the data being sent");
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr()); Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
return channel != nullptr ? channel->Receive(data) : false; PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
return channel->Receive(data);
} }
bool IsClosed() { bool IsClosed() {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->IsClosed(); return holder_->IsClosed();
} }
return false;
}
bool CanSend() { bool CanSend() {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->CanSend(); return holder_->CanSend();
} }
return false;
}
bool CanReceive() { bool CanReceive() {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->CanReceive(); return holder_->CanReceive();
} }
return false;
}
void close() { void close() {
if (IsInitialized()) holder_->Close(); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Close();
} }
size_t Cap() { size_t Cap() {
if (IsInitialized()) return holder_->Cap(); PADDLE_ENFORCE_EQ(IsInitialized(), true,
return -1; "The Channel hasn't been initialized");
return holder_->Cap();
} }
void Lock() { void Lock() {
if (IsInitialized()) holder_->Lock(); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Lock();
} }
void Unlock() { void Unlock() {
if (IsInitialized()) holder_->Unlock(); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Unlock();
} }
template <typename T> template <typename T>
void AddToSendQ(const void* referrer, T* data, void AddToSendQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond, std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) { std::function<bool(ChannelAction)> cb) {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr()); Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) { if (channel != nullptr) {
channel->AddToSendQ(referrer, data, cond, cb); channel->AddToSendQ(referrer, data, cond, cb);
} }
} }
}
template <typename T> template <typename T>
void AddToReceiveQ(const void* referrer, T* data, void AddToReceiveQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond, std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) { std::function<bool(ChannelAction)> cb) {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr()); Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) { if (channel != nullptr) {
channel->AddToReceiveQ(referrer, data, cond, cb); channel->AddToReceiveQ(referrer, data, cond, cb);
} }
} }
}
void RemoveFromSendQ(const void* referrer) { void RemoveFromSendQ(const void* referrer) {
if (IsInitialized()) holder_->RemoveFromSendQ(referrer); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->RemoveFromSendQ(referrer);
} }
void RemoveFromReceiveQ(const void* referrer) { void RemoveFromReceiveQ(const void* referrer) {
if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->RemoveFromReceiveQ(referrer);
} }
inline bool IsInitialized() const { return holder_ != nullptr; } inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() { inline const std::type_index Type() {
PADDLE_ENFORCE_EQ(IsInitialized(), true); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->Type(); return holder_->Type();
} }
......
...@@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
public: public:
virtual bool CanSend(); virtual bool CanSend();
virtual bool CanReceive(); virtual bool CanReceive();
virtual bool Send(T *); virtual void Send(T *);
virtual bool Receive(T *); virtual bool Receive(T *);
virtual size_t Cap() { return cap_; } virtual size_t Cap() { return cap_; }
virtual void Lock(); virtual void Lock();
...@@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel<T> {
} }
}; };
bool send_return(bool value) { void send_return() {
send_ctr--; send_ctr--;
destructor_cond_.notify_all(); destructor_cond_.notify_all();
return value;
} }
bool recv_return(bool value) { bool recv_return(bool value) {
...@@ -88,6 +87,21 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -88,6 +87,21 @@ class ChannelImpl : public paddle::framework::Channel<T> {
return value; return value;
} }
std::shared_ptr<QueueMessage> get_first_message(
std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
while (!queue.empty()) {
// Check whether this message was added by Select
// If this was added by Select then execute the callback
// to check if you can execute this message. The callback
// can return false if some other case was executed in Select.
// In that case just discard this QueueMessage and process next.
std::shared_ptr<QueueMessage> m = queue.front();
queue.pop_front();
if (m->callback == nullptr || m->callback(action)) return m;
}
return nullptr;
}
size_t cap_; size_t cap_;
std::recursive_mutex mu_; std::recursive_mutex mu_;
bool closed_; bool closed_;
...@@ -118,45 +132,33 @@ bool ChannelImpl<T>::CanReceive() { ...@@ -118,45 +132,33 @@ bool ChannelImpl<T>::CanReceive() {
} }
template <typename T> template <typename T>
bool ChannelImpl<T>::Send(T *item) { void ChannelImpl<T>::Send(T *item) {
send_ctr++; send_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_}; std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed, do nothing // If channel is closed, throw exception
if (closed_) { if (closed_) {
send_return();
lock.unlock(); lock.unlock();
// TODO(abhinavarora) Should panic on closed channel PADDLE_THROW("Cannot send on closed channel");
return send_return(false);
} }
// If there is a receiver, directly pass the value we want // If there is a receiver, directly pass the value we want
// to send to the receiver, bypassing the channel buffer if any // to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) { if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = recvq.front(); std::shared_ptr<QueueMessage> m =
recvq.pop_front(); get_first_message(recvq, ChannelAction::SEND);
// Do the data transfer
// We will do this data transfer if either of the following if (m != nullptr) {
// cases are true
// 1. callback == nullptr // This means it was a regular channel send
// 2. callback returns true
bool do_send = true;
if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
if (do_send)
*(m->data) = std::move(*item); *(m->data) = std::move(*item);
else
// We cannot do the data transfer because
// this QueueMessage was added by Select
// and some other case was executed.
// So call the Send function again.
// We do not care about notifying other
// because they would have been notified
// by the executed select case.
return send_return(Send(item));
// Wake up the blocked process and unlock
m->Notify(); m->Notify();
lock.unlock(); send_return();
return send_return(true); return;
} else {
Send(item);
send_return();
return;
}
} }
// Unbuffered channel will always bypass this // Unbuffered channel will always bypass this
...@@ -165,9 +167,8 @@ bool ChannelImpl<T>::Send(T *item) { ...@@ -165,9 +167,8 @@ bool ChannelImpl<T>::Send(T *item) {
if (buf_.size() < cap_) { if (buf_.size() < cap_) {
// Copy to buffer // Copy to buffer
buf_.push_back(std::move(*item)); buf_.push_back(std::move(*item));
// Release lock and return true send_return();
lock.unlock(); return;
return send_return(true);
} }
// Block on channel, because some receiver will complete // Block on channel, because some receiver will complete
...@@ -175,8 +176,12 @@ bool ChannelImpl<T>::Send(T *item) { ...@@ -175,8 +176,12 @@ bool ChannelImpl<T>::Send(T *item) {
auto m = std::make_shared<QueueMessage>(item); auto m = std::make_shared<QueueMessage>(item);
sendq.push_back(m); sendq.push_back(m);
m->Wait(lock); m->Wait(lock);
// TODO(abhinavarora) Should panic on closed channel if (m->chan_closed) {
return send_return(!m->chan_closed); send_return();
lock.unlock();
PADDLE_THROW("Cannot send on closed channel");
}
send_return();
} }
template <typename T> template <typename T>
...@@ -186,39 +191,37 @@ bool ChannelImpl<T>::Receive(T *item) { ...@@ -186,39 +191,37 @@ bool ChannelImpl<T>::Receive(T *item) {
// If channel is closed and buffer is empty or // If channel is closed and buffer is empty or
// channel is unbuffered // channel is unbuffered
if (closed_ && buf_.empty()) { if (closed_ && buf_.empty()) return recv_return(false);
lock.unlock();
return recv_return(false);
}
// If there is a sender, directly receive the value we want // If there is a sender, directly receive the value we want
// from the sender, bypassing the channel buffer if any // from the sender. In case of a buffered channel, read from
// buffer and move front of send queue to the buffer
if (!sendq.empty()) { if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = sendq.front(); std::shared_ptr<QueueMessage> m =
sendq.pop_front(); get_first_message(sendq, ChannelAction::RECEIVE);
// Do the data transfer if (buf_.size() > 0) {
// We will do this data transfer if either of the following // Case 1 : Channel is Buffered
// cases are true // Do Data transfer from front of buffer
// 1. callback == nullptr // This means it was a regular channel send // and add a QueueMessage to the buffer
// 2. callback returns true *item = std::move(buf_.front());
bool do_receive = true; buf_.pop_front();
if (m->callback != nullptr) // If first message from sendq is not null
do_receive = m->callback(ChannelAction::RECEIVE); // add it to the buffer and notify it
if (do_receive) if (m != nullptr) {
// Copy to buffer
buf_.push_back(std::move(*(m->data)));
m->Notify();
} // Ignore if there is no first message
} else {
// Case 2: Channel is Unbuffered
// Do data transfer from front of SendQ
// If front is nullptr, then recursively call itself
if (m != nullptr) {
*item = std::move(*(m->data)); *item = std::move(*(m->data));
else
// We cannot do the data transfer because
// this QueueMessage was added by Select
// and some other case was executed.
// So call the Receive function again.
// We do not care about notifying other
// because they would have been notified
// by the executed select case.
return recv_return(Receive(item));
// Wake up the blocked process and unlock
m->Notify(); m->Notify();
lock.unlock(); } else
return recv_return(Receive(item));
}
return recv_return(true); return recv_return(true);
} }
...@@ -227,8 +230,7 @@ bool ChannelImpl<T>::Receive(T *item) { ...@@ -227,8 +230,7 @@ bool ChannelImpl<T>::Receive(T *item) {
// Directly read from buffer // Directly read from buffer
*item = std::move(buf_.front()); *item = std::move(buf_.front());
buf_.pop_front(); buf_.pop_front();
// Release lock and return true // return true
lock.unlock();
return recv_return(true); return recv_return(true);
} }
......
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include <chrono> #include <chrono>
#include <thread> #include <thread>
#include "gtest/gtest.h" #include "gtest/gtest.h"
using paddle::framework::Channel; using paddle::framework::Channel;
...@@ -37,23 +36,25 @@ TEST(Channel, ChannelCapacityTest) { ...@@ -37,23 +36,25 @@ TEST(Channel, ChannelCapacityTest) {
delete ch; delete ch;
} }
void RecevingOrderEqualToSendingOrder(Channel<int> *ch) { void RecevingOrderEqualToSendingOrder(Channel<int> *ch, int num_items) {
unsigned sum_send = 0; unsigned sum_send = 0;
std::thread t([&]() { std::thread t([&]() {
for (int i = 0; i < 5; i++) { for (int i = 0; i < num_items; i++) {
EXPECT_EQ(ch->Send(&i), true); ch->Send(&i);
sum_send += i; sum_send += i;
} }
}); });
for (int i = 0; i < 5; i++) { std::this_thread::sleep_for(std::chrono::milliseconds(200));
int recv = 999; for (int i = 0; i < num_items; i++) {
int recv = -1;
EXPECT_EQ(ch->Receive(&recv), true); EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i); EXPECT_EQ(recv, i);
} }
std::this_thread::sleep_for(std::chrono::milliseconds(200)); std::this_thread::sleep_for(std::chrono::milliseconds(200));
CloseChannel(ch); CloseChannel(ch);
t.join(); t.join();
EXPECT_EQ(sum_send, 10U); unsigned expected_sum = (num_items * (num_items - 1)) / 2;
EXPECT_EQ(sum_send, expected_sum);
delete ch; delete ch;
} }
...@@ -61,7 +62,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) { ...@@ -61,7 +62,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10; const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size); auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) { for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Send(&i), true); // should not block ch->Send(&i);
} }
size_t out; size_t out;
...@@ -82,7 +83,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) { ...@@ -82,7 +83,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
const size_t data = 5; const size_t data = 5;
std::thread send_thread{[&]() { std::thread send_thread{[&]() {
size_t i = data; size_t i = data;
EXPECT_EQ(ch->Send(&i), true); // should not block ch->Send(&i); // should not block
}}; }};
std::thread recv_thread{[&]() { std::thread recv_thread{[&]() {
...@@ -94,12 +95,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) { ...@@ -94,12 +95,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
send_thread.join(); send_thread.join();
recv_thread.join(); recv_thread.join();
// After closing send should return false. Receive should // After closing send should panic. Receive should
// also return false as there is no data in queue. // also false as there is no data in queue.
CloseChannel(ch); CloseChannel(ch);
send_thread = std::thread{[&]() { send_thread = std::thread{[&]() {
size_t i = data; size_t i = data;
EXPECT_EQ(ch->Send(&i), false); // should return false bool is_exception = false;
try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
}}; }};
recv_thread = std::thread{[&]() { recv_thread = std::thread{[&]() {
size_t i; size_t i;
...@@ -129,7 +136,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { ...@@ -129,7 +136,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
auto ch = MakeChannel<size_t>(buffer_size); auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) { for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Send(&i), true); // sending should not block ch->Send(&i); // sending should not block
} }
size_t out; size_t out;
...@@ -160,9 +167,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -160,9 +167,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
// Try to write more than buffer size. // Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) { for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size) if (i < buffer_size)
EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations ch->Send(&i); // should block after 10 iterations
else else {
EXPECT_EQ(ch->Send(&i), false); bool is_exception = false;
try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
}
} }
}); });
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
...@@ -173,12 +187,28 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -173,12 +187,28 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
auto ch = MakeChannel<int>(0); auto ch = MakeChannel<int>(0);
RecevingOrderEqualToSendingOrder(ch); RecevingOrderEqualToSendingOrder(ch, 20);
}
TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
// Test that Receive Order is same as Send Order when number of items
// sent is less than size of buffer
auto ch = MakeChannel<int>(10);
RecevingOrderEqualToSendingOrder(ch, 5);
} }
TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) { TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
// Test that Receive Order is same as Send Order when number of items
// sent is equal to size of buffer
auto ch = MakeChannel<int>(10); auto ch = MakeChannel<int>(10);
RecevingOrderEqualToSendingOrder(ch); RecevingOrderEqualToSendingOrder(ch, 10);
}
TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
// Test that Receive Order is same as Send Order when number of items
// sent is greater than the size of buffer
auto ch = MakeChannel<int>(10);
RecevingOrderEqualToSendingOrder(ch, 20);
} }
void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) { void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
...@@ -231,7 +261,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) { ...@@ -231,7 +261,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended, bool *success) { [&](bool *ended, bool *success) {
int data = 10; int data = 10;
*success = ch->Send(&data); bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true; *ended = true;
}, },
&thread_ended[i], &send_success[i]); &thread_ended[i], &send_success[i]);
...@@ -316,8 +352,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) { ...@@ -316,8 +352,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
// Try to send more number of times // Try to send more number of times
// than receivers // than receivers
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
try {
ch->Send(&i); ch->Send(&i);
sum_send += i; sum_send += i;
} catch (paddle::platform::EnforceNotMet e) {
}
} }
}); });
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
...@@ -382,7 +421,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { ...@@ -382,7 +421,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended, bool *success) { [&](bool *ended, bool *success) {
int data = 10; int data = 10;
*success = ch->Send(&data); bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true; *ended = true;
}, },
&thread_ended[i], &send_success[i]); &thread_ended[i], &send_success[i]);
...@@ -508,7 +553,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) { ...@@ -508,7 +553,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) {
unsigned sum_send = 0; unsigned sum_send = 0;
std::thread t([&]() { std::thread t([&]() {
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true); ch->Send(&i);
sum_send += i; sum_send += i;
} }
}); });
...@@ -541,8 +586,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) { ...@@ -541,8 +586,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) {
ChannelHolder *ch = new ChannelHolder(); ChannelHolder *ch = new ChannelHolder();
EXPECT_EQ(ch->IsInitialized(), false); EXPECT_EQ(ch->IsInitialized(), false);
int i = 10; int i = 10;
EXPECT_EQ(ch->Send(&i), false); bool send_exception = false;
EXPECT_EQ(ch->Receive(&i), false); try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
send_exception = true;
}
EXPECT_EQ(send_exception, true);
bool recv_exception = false;
try {
ch->Receive(&i);
} catch (paddle::platform::EnforceNotMet e) {
recv_exception = true;
}
EXPECT_EQ(recv_exception, true);
bool is_exception = false; bool is_exception = false;
try { try {
ch->Type(); ch->Type();
...@@ -669,7 +728,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { ...@@ -669,7 +728,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended, bool *success) { [&](bool *ended, bool *success) {
int data = 10; int data = 10;
*success = ch->Send(&data); bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true; *ended = true;
}, },
&thread_ended[i], &send_success[i]); &thread_ended[i], &send_success[i]);
...@@ -760,7 +825,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { ...@@ -760,7 +825,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended, bool *success) { [&](bool *ended, bool *success) {
int data = 10; int data = 10;
*success = ch->Send(&data); bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true; *ended = true;
}, },
&thread_ended[i], &send_success[i]); &thread_ended[i], &send_success[i]);
......
cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
if(WITH_GPU)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
else()
set(multi_devices_graph_builder_deps)
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/computation_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
platform::Place place)
: op_(framework::OpRegistry::CreateOp(op_desc)),
scope_(scope),
place_(place) {}
void ComputationOpHandle::RunImpl() {
auto *cur_ctx = dev_ctxes_[place_];
for (auto *in : inputs_) {
bool need_wait =
in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx;
if (need_wait) {
in->generated_op_->Wait(cur_ctx);
}
}
op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
}
std::string ComputationOpHandle::Name() const { return op_->Type(); }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct ComputationOpHandle : public OpHandleBase {
std::unique_ptr<OperatorBase> op_;
Scope *scope_;
platform::Place place_;
ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
platform::Place place);
std::string Name() const override;
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fetch_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
std::vector<Scope *> *local_scopes)
: data_(data), offset_(offset), local_scopes_(local_scopes) {}
FetchOpHandle::~FetchOpHandle() {
for (auto *input_var : inputs_) {
input_var->pending_ops_.erase(this);
}
}
void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) {
PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
}
void FetchOpHandle::WaitAndMergeCPUTensors() const {
std::vector<const LoDTensor *> tensors_ptr;
tensors_ptr.reserve(tensors_.size());
for (auto &t : tensors_) {
tensors_ptr.emplace_back(&t);
}
data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
}
void FetchOpHandle::RunImpl() {
auto cpu_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
for (auto *input : inputs_) {
auto *var = static_cast<VarHandle *>(input);
var->generated_op_->Wait(cpu_ctx);
}
tensors_.resize(inputs_.size());
auto *var = static_cast<VarHandle *>(inputs_[0]);
auto &var_name = var->name_;
platform::CPUPlace cpu;
auto &scopes = *local_scopes_;
for (size_t i = 0; i < scopes.size(); ++i) {
auto &scope = scopes[i];
auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
if (platform::is_gpu_place(var->place_)) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
dev_ctxes_[t.place()]->Wait();
#endif
} else {
tensors_[i].ShareDataWith(t);
tensors_[i].set_lod(t.lod());
}
}
this->WaitAndMergeCPUTensors();
}
std::string FetchOpHandle::Name() const { return "Fetch"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct FetchOpHandle : public OpHandleBase {
FeedFetchList *data_;
size_t offset_;
std::vector<Scope *> *local_scopes_;
std::vector<LoDTensor> tensors_;
FetchOpHandle(FeedFetchList *data, size_t offset,
std::vector<Scope *> *local_scopes);
~FetchOpHandle();
void Wait(platform::DeviceContext *waited_dev) override;
void WaitAndMergeCPUTensors() const;
std::string Name() const override;
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
namespace paddle {
namespace framework {
namespace details {
#ifdef PADDLE_WITH_CUDA
MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes,
platform::NCCLContextMap *nccl_ctxs)
: loss_var_name_(loss_var_name),
places_(places),
local_scopes_(local_scopes),
nccl_ctxs_(nccl_ctxs) {
#else
MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes)
: loss_var_name_(loss_var_name),
places_(places),
local_scopes_(local_scopes) {
#endif
for (auto &p : params) {
grad_names_.insert(GradVarName(p));
}
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
auto graph = new SSAGraph();
SSAGraph &result = *graph;
result.vars_.resize(places_.size());
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
bool change_forward = false;
if (!is_forwarding) {
// FIXME(yy): Do not hard code like this
if (op->OutputArgumentNames().size() == 1 &&
op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
continue; // Drop fill 1. for backward coeff;
}
}
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i];
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get();
op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
platform::DeviceContextPool::Instance().Get(p));
auto var_names = op->InputArgumentNames();
for (auto &each_var_name : var_names) {
VarHandle *var =
CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
op_handle->AddInput(var);
}
var_names = op->OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(&result, op_handle, each_var_name, p, i);
}
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
communication_dev_ctx);
result.ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale
// factor. So it does not depend on any other operators.
// VarHandle *loss = GetVarHandle(loss_var_name, place);
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
change_forward = true;
}
}
}
if (change_forward) {
is_forwarding = false;
}
if (!is_forwarding) {
auto var_names = op->OutputArgumentNames();
for (auto &og : var_names) {
if (grad_names_.count(og) != 0) { // is param grad
// Insert NCCL AllReduce Op
#ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result.ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto &vars = result.vars_[i][og];
if (vars.empty()) { // This device has no data. continue.
continue;
}
auto *prev_grad = &vars[vars.size() - 1];
op_handle->AddInput(prev_grad);
auto &var = vars[vars.size()];
var.place_ = p;
var.name_ = og;
var.version_ = vars.size() - 1;
op_handle->AddOutput(&var);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
}
}
}
/*
Dependency graph has been constructed. However, there are still data
harzaeds need to be handled.
*/
PolishGraphToSupportDataHazards(&result);
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
PrintGraphviz(*graph, sout);
VLOG(10) << sout.str();
}
return std::unique_ptr<SSAGraph>(graph);
} // namespace details
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle {
namespace platform {
class NCCLContextMap;
}
namespace framework {
class Scope;
namespace details {
class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
public:
#ifdef PADDLE_WITH_CUDA
MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes,
platform::NCCLContextMap *nccl_ctxs);
#else
MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes);
#endif
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
private:
std::string loss_var_name_;
const std::vector<platform::Place> &places_;
const std::vector<Scope *> &local_scopes_;
std::unordered_set<std::string> grad_names_;
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_;
#endif
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap &ctxs)
: local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
for (auto &p : places_) {
this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p);
}
}
void NCCLAllReduceOpHandle::RunImpl() {
if (inputs_.size() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {
// Wait input done
for (auto *in : inputs_) {
auto &p = static_cast<VarHandle *>(in)->place_;
in->generated_op_->Wait(dev_ctxes_[p]);
}
auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i];
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) {
dtype = platform::ToNCCLDataType(lod_tensor.type());
}
if (numel == 0) {
numel = static_cast<size_t>(lod_tensor.numel());
}
auto &nccl_ctx = nccl_ctxs_.at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
});
}
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
}
}
std::string NCCLAllReduceOpHandle::Name() const { return "NCCL AllReduce"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/nccl_helper.h"
namespace paddle {
namespace framework {
namespace details {
struct NCCLAllReduceOpHandle : public OpHandleBase {
const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_;
const platform::NCCLContextMap &nccl_ctxs_;
NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap &ctxs);
std::string Name() const override;
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/op_handle_base.h"
namespace paddle {
namespace framework {
namespace details {
std::string OpHandleBase::DebugString() const {
std::stringstream ss;
ss << "(";
for (auto *var : inputs_) {
ss << var->DebugString() << ", ";
}
ss << ") --> (";
for (auto *var : outputs_) {
ss << var->DebugString() << ", ";
}
ss << ")\n";
return ss.str();
}
OpHandleBase::~OpHandleBase() {
#ifdef PADDLE_WITH_CUDA
for (auto &ev : events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
}
#endif
}
void OpHandleBase::Run(bool use_event) {
#ifdef PADDLE_WITH_CUDA
if (events_.empty() && use_event) {
for (auto &p : dev_ctxes_) {
int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
}
}
#else
PADDLE_ENFORCE(!use_event);
#endif
RunImpl();
#ifdef PADDLE_WITH_CUDA
if (use_event) {
for (auto &p : dev_ctxes_) {
int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
auto stream =
static_cast<platform::CUDADeviceContext *>(p.second)->stream();
PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
}
}
#endif
}
void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) {
for (auto &dev_ctx : dev_ctxes_) {
dev_ctx.second->Wait();
}
} else {
auto stream =
static_cast<platform::CUDADeviceContext *>(waited_dev)->stream();
for (auto &ev : events_) {
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
}
}
#else
for (auto &dev_ctx : dev_ctxes_) {
dev_ctx.second->Wait();
}
#endif
}
void OpHandleBase::AddInput(VarHandleBase *in) {
this->inputs_.emplace_back(in);
in->pending_ops_.insert(this);
}
void OpHandleBase::AddOutput(VarHandleBase *out) {
outputs_.emplace_back(out);
out->generated_op_ = this;
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace framework {
namespace details {
class OpHandleBase {
private:
DISABLE_COPY_AND_ASSIGN(OpHandleBase);
public:
std::vector<VarHandleBase *> inputs_;
std::vector<VarHandleBase *> outputs_;
std::unordered_map<platform::Place, platform::DeviceContext *,
platform::PlaceHash>
dev_ctxes_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<int, cudaEvent_t> events_;
#endif
OpHandleBase() {}
std::string DebugString() const;
virtual std::string Name() const = 0;
virtual ~OpHandleBase();
void Run(bool use_event);
virtual void Wait(platform::DeviceContext *waited_dev);
void AddInput(VarHandleBase *in);
void AddOutput(VarHandleBase *out);
protected:
virtual void RunImpl() = 0;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
platform::Place place,
platform::DeviceContext *dev_ctx)
: coeff_(static_cast<float>(1.0 / num_dev)), scope_(scope), place_(place) {
dev_ctxes_[place_] = dev_ctx;
}
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() {
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
float *tmp =
scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>(
make_ddim({1}), place_);
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
} else {
#ifdef PADDLE_WITH_CUDA
auto stream =
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
#endif
}
}
std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
struct ScaleLossGradOpHandle : public OpHandleBase {
float coeff_;
Scope *scope_;
platform::Place place_;
ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
platform::DeviceContext *context);
~ScaleLossGradOpHandle() final;
std::string Name() const override;
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph.h"
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
namespace paddle {
namespace framework {
namespace details {
struct SSAGraph {
std::vector<std::unordered_map<std::string, std::map<int, VarHandle>>> vars_;
// aux variables to represent dependency. Useful to resolve data hazard.
std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
std::vector<std::unique_ptr<OpHandleBase>> ops_;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle {
namespace framework {
namespace details {
void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
for (auto &var_map : graph->vars_) {
for (auto &name_pair : var_map) {
if (name_pair.second.size() <= 1) {
continue;
}
auto it_new = name_pair.second.rbegin();
auto it_old = name_pair.second.rbegin();
++it_old;
for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
auto *write_op = it_new->second.generated_op_;
auto &read_ops = it_old->second.pending_ops_;
for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op;
if (read_op == write_op) {
// Read Write is the same op.
continue;
}
auto *dep_var = new DummyVarHandle();
read_op->AddOutput(dep_var);
write_op->AddInput(dep_var);
graph->dep_vars_.emplace(dep_var);
}
}
}
}
}
VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
SSAGraph *graph, const std::string &each_var_name,
const platform::Place &place, size_t place_offset) {
auto &var_holders = graph->vars_[place_offset];
auto &var_holder = var_holders[each_var_name];
VarHandle *var = nullptr;
if (var_holder.empty()) {
auto &init_var = var_holder[0];
init_var.place_ = place;
init_var.name_ = each_var_name;
init_var.generated_op_ = nullptr;
init_var.version_ = 0;
var = &init_var;
} else {
var = &var_holder.rbegin()->second;
}
return var;
}
void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
const std::string &each_var_name,
const platform::Place &place,
size_t place_offset) {
auto &vars = graph->vars_[place_offset][each_var_name];
size_t version = vars.size();
auto &var = vars[version];
var.version_ = version;
var.name_ = each_var_name;
var.place_ = place;
op_handle->AddOutput(&var);
}
template <typename Callback>
void IterAllVar(const SSAGraph &graph, Callback callback) {
for (auto &each : graph.vars_) {
for (auto &pair1 : each) {
for (auto &pair2 : pair1.second) {
callback(pair2.second);
}
}
}
for (auto &var : graph.dep_vars_) {
callback(*var);
}
}
void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
size_t var_id = 0;
std::unordered_map<const VarHandleBase *, size_t> vars;
sout << "digraph G {\n";
IterAllVar(graph, [&](const VarHandleBase &var) {
auto *var_ptr = &var;
auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
size_t cur_var_id = var_id++;
vars[var_ptr] = cur_var_id;
if (var_handle_ptr) {
sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
<< "\\n"
<< var_handle_ptr->place_ << "\\n"
<< var_handle_ptr->version_ << "\"]" << std::endl;
} else if (dummy_ptr) {
sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
}
});
size_t op_id = 0;
for (auto &op : graph.ops_) {
std::string op_name = "op_" + std::to_string(op_id++);
sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
<< std::endl;
for (auto in : op->inputs_) {
std::string var_name = "var_" + std::to_string(vars[in]);
sout << var_name << " -> " << op_name << std::endl;
}
for (auto out : op->outputs_) {
std::string var_name = "var_" + std::to_string(vars[out]);
sout << op_name << " -> " << var_name << std::endl;
}
}
sout << "}\n";
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h"
#include <memory>
#include <string>
namespace paddle {
namespace framework {
namespace details {
class SSAGraphBuilder {
public:
SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
protected:
/**
* We only handle write after read(WAR), since it should not have a write
* after write in program. If there are write after write operators, we need
* prune them.
*
* https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
*/
static void PolishGraphToSupportDataHazards(SSAGraph *graph);
static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph,
const std::string &each_var_name,
const platform::Place &place,
size_t place_offset);
static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
const std::string &each_var_name,
const platform::Place &place, size_t place_offset);
static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
namespace paddle {
namespace framework {
namespace details {
SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph)
: graph_(std::move(graph)) {}
SSAGraphExecutor::~SSAGraphExecutor() {}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
namespace paddle {
namespace framework {
namespace details {
class SSAGraphExecutor {
DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
public:
// Steal graph inside
explicit SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph);
virtual ~SSAGraphExecutor();
virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
protected:
std::unique_ptr<SSAGraph> graph_;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
size_t num_threads, bool use_event,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<SSAGraph> &&graph)
: SSAGraphExecutor(std::move(graph)),
pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr),
local_scopes_(local_scopes),
places_(places),
fetch_ctxs_(places),
use_event_(use_event) {}
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars;
std::unordered_set<OpHandleBase *> ready_ops;
auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
pending_vars.insert(&var);
if (var.generated_op_ == nullptr) {
ready_vars.Push(&var);
}
};
auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) {
pending_ops.insert({&op_instance, op_instance.inputs_.size()});
};
// Transform SSAGraph to pending_ops & pending_vars
for (auto &var_map : graph_->vars_) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
InsertPendingVar(version_pair.second);
}
}
}
for (auto &var : graph_->dep_vars_) {
InsertPendingVar(*var);
}
for (auto &op : graph_->ops_) {
if (op->inputs_.empty()) { // Special case, Op has no input.
ready_ops.insert(op.get());
} else {
InsertPendingOp(*op);
}
}
// Step 2. Insert FetchOps
std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
std::vector<DummyVarHandle> dummy_vars;
FeedFetchList fetch_data(fetch_tensors.size());
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->vars_) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second);
}
}
}
for (size_t i = 0; i < fetch_tensors.size(); ++i) {
auto &var_name = fetch_tensors[i];
auto &vars = fetched_vars.at(var_name);
auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
fetch_ops.emplace_back(op);
// FIXME: Use new device context
for (auto &p : places_) {
op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
}
for (auto *var : vars) {
op->AddInput(var);
}
InsertPendingOp(*op);
}
auto run_all_ready_ops = [&] {
for (auto *op : ready_ops) {
RunOp(ready_vars, op);
}
ready_ops.clear();
};
// Create local scopes.
for (auto &scope : local_scopes_) {
auto &local_scope = scope->NewScope();
*scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
}
// Step 3. Execution
while (!pending_vars.empty()) {
// 1. Run All Ready ops
run_all_ready_ops();
// 2. Find ready variable
bool timeout;
auto cur_ready_vars = ready_vars.PopAll(1000, &timeout);
if (timeout) {
if (exception_) {
throw * exception_;
} else {
continue;
}
}
// 3. Remove the dependency of ready_var.
// Find the ready_ops after the ready_var.
for (auto ready_var : cur_ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->pending_ops_) {
auto &deps = pending_ops[op];
--deps;
if (deps == 0) {
ready_ops.insert(op);
}
}
}
// Keep loop until all vars are ready.
}
++computation_count_;
auto sync_computation = [&] {
computation_count_ = 0;
// Wait All computational streams
for (auto p : this->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
for (auto &scope : local_scopes_) {
scope->DropKids();
}
};
// Wait FetchOps.
if (!fetch_ops.empty()) {
fetch_ops.clear();
sync_computation();
}
if (computation_count_ == max_async_computation) {
sync_computation();
}
// NOTE: the temp scope can be dropped lazily if needed.
// Drop tmp scopes;
for (auto &scope : local_scopes_) {
auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
kid = nullptr;
}
return fetch_data;
}
void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue<VarHandleBase *> &ready_var_q, details::OpHandleBase *op) {
auto op_run = [&ready_var_q, op, this] {
try {
VLOG(10) << op->Name() << " : " << op->DebugString();
op->Run(use_event_);
ready_var_q.Extend(op->outputs_);
} catch (platform::EnforceNotMet ex) {
exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) {
LOG(FATAL) << "Unknown exception catched";
}
};
if (pool_) {
pool_->enqueue(op_run);
} else {
op_run();
}
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <functional>
#include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
namespace paddle {
namespace framework {
class Scope;
namespace details {
template <typename T>
class BlockingQueue {
public:
void Push(const T &item) {
{
std::lock_guard<std::mutex> g(mutex_);
q_.emplace_back(item);
}
cv_.notify_one();
}
template <typename U>
void Extend(const U &items) {
{
std::lock_guard<std::mutex> g(mutex_);
for (auto &item : items) {
q_.emplace_back(item);
}
}
cv_.notify_all();
}
std::deque<T> PopAll(size_t ms, bool *timeout) {
auto time =
std::chrono::system_clock::now() + std::chrono::milliseconds(ms);
std::unique_lock<std::mutex> lock(mutex_);
*timeout = !cv_.wait_until(lock, time, [this] { return !q_.empty(); });
std::deque<T> ret;
if (!*timeout) {
std::swap(ret, q_);
}
return ret;
}
private:
std::mutex mutex_;
std::condition_variable cv_;
std::deque<T> q_;
};
class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
public:
ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<SSAGraph> &&graph);
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
~ThreadedSSAGraphExecutor() {}
private:
void RunOp(BlockingQueue<VarHandleBase *> &ready_var_q,
details::OpHandleBase *op);
private:
std::unique_ptr<::ThreadPool> pool_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_;
const bool use_event_;
std::unique_ptr<platform::EnforceNotMet> exception_;
size_t computation_count_{0};
size_t max_async_computation{100};
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/var_handle.h"
namespace paddle {
namespace framework {
namespace details {
VarHandleBase::~VarHandleBase() {}
std::string VarHandle::DebugString() const {
std::stringstream ss;
ss << name_ << ":" << place_;
return ss.str();
}
std::string DummyVarHandle::DebugString() const { return "dummy"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sstream>
#include <string>
#include <unordered_set>
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace details {
struct OpHandleBase;
// VarHandleBase is the var node in the dependency graph.
// A variable can only be generated by a single operator. i.e.
// This is a single assignment graph.
struct VarHandleBase {
virtual ~VarHandleBase();
virtual std::string DebugString() const = 0;
// The operator who generate this variable. nullptr if the variable
// is a root node.
OpHandleBase *generated_op_;
// Operators which depend on this variable ready.
std::unordered_set<OpHandleBase *> pending_ops_;
};
// VarHandle is actually a single version of Runtime Variable.
// Variable in Runtime mapped to many VarHandles in Graph.
// Each assignment will generate a new var handle with newer version.
//
// NOTE: runtime variables have place.
struct VarHandle : public VarHandleBase {
std::string DebugString() const override;
// version field currently is not used, however, just store the version to
// debug easily.
size_t version_;
std::string name_;
platform::Place place_;
};
// Dummy Variable. It is used to represent dependencies between operators
struct DummyVarHandle : public VarHandleBase {
std::string DebugString() const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -46,7 +46,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { ...@@ -46,7 +46,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarType::Type var_type) { void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) { if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>(); var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) { } else if (var_type == proto::VarType::SELECTED_ROWS) {
...@@ -309,12 +309,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -309,12 +309,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
if (var->Persistable()) { if (var->Persistable()) {
auto* ptr = scope->Var(var->Name()); auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType()); InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name() VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr; << " global, which pointer is " << ptr;
} else { } else {
auto* ptr = local_scope->Var(var->Name()); auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType()); InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name() VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr; << " locally, which pointer is " << ptr;
} }
...@@ -322,7 +322,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -322,7 +322,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} else { } else {
for (auto& var : block.AllVars()) { for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name()); auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType()); InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is " VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr; << ptr;
} }
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
struct ExecutorPrepareContext { struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
......
...@@ -517,6 +517,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -517,6 +517,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// do data transform // do data transform
Scope& new_scope = scope.NewScope(); Scope& new_scope = scope.NewScope();
std::vector<std::string> inplace_vars;
for (auto& var_name_item : this->Inputs()) { for (auto& var_name_item : this->Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
auto* var = scope.FindVar(var_name); auto* var = scope.FindVar(var_name);
...@@ -529,10 +530,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -529,10 +530,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto out_var_names = OutputVars(true); auto out_var_names = OutputVars(true);
if (std::find(out_var_names.begin(), out_var_names.end(), if (std::find(out_var_names.begin(), out_var_names.end(),
var_name) != out_var_names.end()) { var_name) != out_var_names.end()) {
PADDLE_THROW( inplace_vars.push_back(var_name);
"var %s is both input and output, "
"does not support transform",
var_name);
} }
VLOG(3) << "Transform Variable " << var_name << " from " VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key; << kernel_type_for_var << " to " << expected_kernel_key;
...@@ -551,6 +549,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -551,6 +549,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
kernel_iter->second->Compute( kernel_iter->second->Compute(
ExecutionContext(*this, new_scope, *new_dev_ctx)); ExecutionContext(*this, new_scope, *new_dev_ctx));
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name));
original_tensor->ShareDataWith(*transformed_tensor);
}
/*For profiling/benchmark only*/ /*For profiling/benchmark only*/
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
new_dev_ctx->Wait(); new_dev_ctx->Wait();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <string>
#include "ThreadPool.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
namespace paddle {
namespace framework {
class ParallelExecutorPrivate {
public:
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
: places_(places) {}
std::vector<platform::Place> places_;
std::vector<Scope *> local_scopes_;
Scope *global_scope_;
std::unique_ptr<details::SSAGraphExecutor> executor_;
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
};
ParallelExecutor::ParallelExecutor(
size_t num_threads, bool use_event,
const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const ProgramDesc &main_program,
const std::string &loss_var_name, Scope *scope)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
// Step 1. RunStartupProgram and Bcast the params to devs.
Executor exe(places[0]);
exe.Run(startup_program, scope, 0);
// Create local scopes
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope());
}
// Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
#endif
if (platform::is_gpu_place(places[0]) &&
member_->local_scopes_.size() != 1) { // Is CUDA
BCastParamsToGPUs(startup_program);
}
// Startup Program has been run. All local scopes has correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#ifdef PADDLE_WITH_CUDA
details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
params, member_->local_scopes_,
member_->nccl_ctxs_.get());
#else
details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
params, member_->local_scopes_);
#endif
auto graph = builder.Build(main_program);
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
num_threads, use_event, member_->local_scopes_, places,
std::move(graph)));
// Step 3. Create vars in each scope;
for (auto *scope : member_->local_scopes_) {
for (auto *var : main_program.Block(0).AllVars()) {
if (scope->FindVar(var->Name()) != nullptr) {
continue;
}
InitializeVariable(scope->Var(var->Name()), var->GetType());
}
}
}
void ParallelExecutor::BCastParamsToGPUs(
const ProgramDesc &startup_program) const {
#ifdef PADDLE_WITH_CUDA
auto *main_scope = member_->local_scopes_[0];
for (auto *var_desc : startup_program.Block(0).AllVars()) {
size_t idx = var_desc->Name().find("@GRAD");
if (idx != std::string::npos) continue;
if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
auto &main_tensor =
main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i];
void *buffer;
if (i == 0) {
buffer = const_cast<void *>(main_tensor.data<void>());
} else {
auto local_scope = member_->local_scopes_[i];
auto *t =
local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type());
}
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
}
} else {
platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
t->Resize(dims);
t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
}
}
}
member_->nccl_ctxs_->WaitAll();
}
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
}
void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <future>
#include <unordered_set>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class ParallelExecutorPrivate;
class ParallelExecutor {
DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
public:
explicit ParallelExecutor(size_t num_threads, bool use_event,
const std::vector<platform::Place>& places,
const std::unordered_set<std::string>& params,
const ProgramDesc& startup_program,
const ProgramDesc& main_program,
const std::string& loss_var_name, Scope* scope);
void Run(const std::vector<std::string>& fetch_tensors,
const std::string& fetched_var_name = "fetched_var");
private:
ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
};
} // namespace framework
} // namespace paddle
...@@ -29,7 +29,7 @@ void FileReader::ReadNext(std::vector<LoDTensor> *out) { ...@@ -29,7 +29,7 @@ void FileReader::ReadNext(std::vector<LoDTensor> *out) {
PADDLE_ENFORCE_EQ(actual.size(), expect.size()); PADDLE_ENFORCE_EQ(actual.size(), expect.size());
for (int j = 0; j < actual.size(); ++j) { for (int j = 0; j < actual.size(); ++j) {
PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1); // PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1);
} }
} }
} }
......
...@@ -45,10 +45,11 @@ class Tensor { ...@@ -45,10 +45,11 @@ class Tensor {
friend struct EigenVector; friend struct EigenVector;
public: public:
Tensor() : offset_(0) {} Tensor() : offset_(0), is_pinned_(false) {}
/*! Constructor with place should only be used in pybind. */ /*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) { explicit Tensor(const platform::Place& place)
: offset_(0), is_pinned_(false) {
holder_->set_place(place); holder_->set_place(place);
} }
...@@ -69,11 +70,12 @@ class Tensor { ...@@ -69,11 +70,12 @@ class Tensor {
* @note If not exist, then allocation. * @note If not exist, then allocation.
*/ */
template <typename T> template <typename T>
inline T* mutable_data(platform::Place place); inline T* mutable_data(platform::Place place, bool is_pinned = false);
inline void* mutable_data(platform::Place place, std::type_index type); inline void* mutable_data(platform::Place place, std::type_index type,
bool is_pinned = false);
inline void* mutable_data(platform::Place place); inline void* mutable_data(platform::Place place, bool is_pinned = false);
/** /**
* @brief Return a pointer to mutable memory block. * @brief Return a pointer to mutable memory block.
...@@ -84,7 +86,8 @@ class Tensor { ...@@ -84,7 +86,8 @@ class Tensor {
* @note If not exist, then allocation. * @note If not exist, then allocation.
*/ */
template <typename T> template <typename T>
inline T* mutable_data(DDim dims, platform::Place place); inline T* mutable_data(DDim dims, platform::Place place,
bool is_pinned = false);
/*! Return the dimensions of the memory block. */ /*! Return the dimensions of the memory block. */
inline const DDim& dims() const; inline const DDim& dims() const;
...@@ -92,6 +95,9 @@ class Tensor { ...@@ -92,6 +95,9 @@ class Tensor {
/*! Return the numel of the memory block. */ /*! Return the numel of the memory block. */
inline int64_t numel() const; inline int64_t numel() const;
/*! Return the numel of the memory block. */
inline bool isPinned() const;
/*! Resize the dimensions of the memory block. */ /*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims); inline Tensor& Resize(const DDim& dims);
...@@ -146,12 +152,14 @@ class Tensor { ...@@ -146,12 +152,14 @@ class Tensor {
template <typename Place> template <typename Place>
struct PlaceholderImpl : public Placeholder { struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type) PlaceholderImpl(Place place, size_t size, std::type_index type,
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)), bool is_pinned = false)
memory::PODDeleter<uint8_t, Place>(place)), : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
place_(place), place_(place),
size_(size), size_(size),
type_(type) { type_(type),
is_pinned_(is_pinned) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU")); (is_cpu_place(place_) ? "CPU" : "GPU"));
} }
...@@ -174,6 +182,9 @@ class Tensor { ...@@ -174,6 +182,9 @@ class Tensor {
/* the current type of memory */ /* the current type of memory */
std::type_index type_; std::type_index type_;
/*! use pinned memory or not. */
bool is_pinned_;
}; };
/*! holds the memory block if allocated. */ /*! holds the memory block if allocated. */
...@@ -208,6 +219,7 @@ class Tensor { ...@@ -208,6 +219,7 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins. * PlaceHolder::ptr_ and where the tensor data really begins.
*/ */
size_t offset_; size_t offset_;
bool is_pinned_;
}; };
inline void Tensor::switch_place(platform::Place new_place) { inline void Tensor::switch_place(platform::Place new_place) {
......
...@@ -101,25 +101,27 @@ inline T* Tensor::data() { ...@@ -101,25 +101,27 @@ inline T* Tensor::data() {
} }
template <typename T> template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place) { inline T* Tensor::mutable_data(DDim dims, platform::Place place,
bool is_pinned) {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims); Resize(dims);
return mutable_data<T>(place); return mutable_data<T>(place, is_pinned);
} }
template <typename T> template <typename T>
inline T* Tensor::mutable_data(platform::Place place) { inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T))); return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
} }
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
bool is_pinned) {
if (holder_ != nullptr) { if (holder_ != nullptr) {
holder_->set_type(type); holder_->set_type(type);
} }
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GE(numel(), 0,
numel(), 0, "When calling this method, the Tensor's numel must be "
"When calling this method, the Tensor's numel must be larger than zero. " "equal or larger than zero. "
"Please check Tensor::Resize has been called first."); "Please check Tensor::Resize has been called first.");
int64_t size = numel() * SizeOfType(type); int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */ /* some versions of boost::variant don't have operator!= */
...@@ -127,26 +129,27 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { ...@@ -127,26 +129,27 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
holder_->size() < size + offset_) { holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>( holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type)); boost::get<platform::CPUPlace>(place), size, type, is_pinned));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
} }
#else #else
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>( holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type)); boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
} }
#endif #endif
offset_ = 0; offset_ = 0;
is_pinned_ = is_pinned;
} }
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_); offset_);
} }
inline void* Tensor::mutable_data(platform::Place place) { inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
PADDLE_ENFORCE(this->holder_ != nullptr, PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing"); "Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type()); return mutable_data(place, holder_->type(), is_pinned);
} }
inline Tensor& Tensor::ShareDataWith(const Tensor& src) { inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
...@@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; } ...@@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); } inline int64_t Tensor::numel() const { return product(dims_); }
inline bool Tensor::isPinned() const { return is_pinned_; }
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res; Tensor res;
res.ShareDataWith(src); res.ShareDataWith(src);
......
...@@ -32,6 +32,8 @@ namespace framework { ...@@ -32,6 +32,8 @@ namespace framework {
// number of threads. // number of threads.
class ThreadPool { class ThreadPool {
public: public:
explicit ThreadPool(int num_threads);
using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>; using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
// Returns the singleton of ThreadPool. // Returns the singleton of ThreadPool.
...@@ -103,8 +105,6 @@ class ThreadPool { ...@@ -103,8 +105,6 @@ class ThreadPool {
DISABLE_COPY_AND_ASSIGN(ThreadPool); DISABLE_COPY_AND_ASSIGN(ThreadPool);
explicit ThreadPool(int num_threads);
// If the task queue is empty and avaialbe is equal to the number of // If the task queue is empty and avaialbe is equal to the number of
// threads, means that all tasks are completed. Note: this function // threads, means that all tasks are completed. Note: this function
// is not thread-safe. Returns true if all tasks are completed. // is not thread-safe. Returns true if all tasks are completed.
......
...@@ -79,7 +79,18 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { ...@@ -79,7 +79,18 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
// if size is 0. We just make sure it does. // if size is 0. We just make sure it does.
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
void* p; void* p;
int prev_id;
cudaGetDevice(&prev_id);
if (prev_id != gpu_id_) {
cudaSetDevice(gpu_id_);
}
cudaError_t result = cudaMalloc(&p, size); cudaError_t result = cudaMalloc(&p, size);
if (prev_id != gpu_id_) {
cudaSetDevice(prev_id);
}
if (result == cudaSuccess) { if (result == cudaSuccess) {
index = 0; index = 0;
gpu_alloc_size_ += size; gpu_alloc_size_ += size;
...@@ -119,6 +130,50 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { ...@@ -119,6 +130,50 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
bool GPUAllocator::UseGpu() const { return true; } bool GPUAllocator::UseGpu() const { return true; }
// PINNED memory allows direct DMA transfers by the GPU to and from system
// memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
if (size <= 0) return nullptr;
void* p;
// NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
if (size > usable) return nullptr;
// PINNED memory is visible to all CUDA contexts.
cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) {
index = 1;
fallback_alloc_size_ += size;
return p;
}
return nullptr;
}
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err;
PADDLE_ASSERT(index == 1);
PADDLE_ASSERT(fallback_alloc_size_ >= size);
fallback_alloc_size_ -= size;
err = cudaFreeHost(p);
// Purposefully allow cudaErrorCudartUnloading, because
// that is returned if you ever call cudaFreeHost after the
// driver has already shutdown. This happens only if the
// process is terminating, in which case we don't care if
// cudaFreeHost succeeds.
if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free.");
}
}
bool CUDAPinnedAllocator::UseGpu() const { return true; }
#endif #endif
} // namespace detail } // namespace detail
......
...@@ -43,6 +43,8 @@ class CPUAllocator : public SystemAllocator { ...@@ -43,6 +43,8 @@ class CPUAllocator : public SystemAllocator {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
class GPUAllocator : public SystemAllocator { class GPUAllocator : public SystemAllocator {
public: public:
explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
...@@ -50,6 +52,19 @@ class GPUAllocator : public SystemAllocator { ...@@ -50,6 +52,19 @@ class GPUAllocator : public SystemAllocator {
private: private:
size_t gpu_alloc_size_ = 0; size_t gpu_alloc_size_ = 0;
size_t fallback_alloc_size_ = 0; size_t fallback_alloc_size_ = 0;
int gpu_id_;
};
class CUDAPinnedAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ =
0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
size_t fallback_alloc_size_ = 0;
}; };
#endif #endif
......
...@@ -58,7 +58,7 @@ TEST(CPUAllocator, LockMem) { ...@@ -58,7 +58,7 @@ TEST(CPUAllocator, LockMem) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(GPUAllocator, Alloc) { TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a; paddle::memory::detail::GPUAllocator a(0);
TestAllocator(a, 2048); TestAllocator(a, 2048);
TestAllocator(a, 0); TestAllocator(a, 0);
} }
......
...@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() { ...@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
} }
template <> template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
bool is_pinned) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size); void* p = GetCPUBuddyAllocator()->Alloc(size);
VLOG(10) << " pointer=" << p; VLOG(10) << " pointer=" << p;
...@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { ...@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
} }
template <> template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) { void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
bool is_pinned) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p); GetCPUBuddyAllocator()->Free(p);
} }
...@@ -69,7 +71,31 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { ...@@ -69,7 +71,31 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
} }
platform::SetDeviceId(gpu_id); platform::SetDeviceId(gpu_id);
if (!as[gpu_id]) { if (!as[gpu_id]) {
as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator, as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator(gpu_id),
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
return as[gpu_id];
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
static BuddyAllocator** as = NULL;
if (as == NULL) {
int gpu_num = platform::GetCUDADeviceCount();
as = new BuddyAllocator*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
as[gpu] = nullptr;
}
}
platform::SetDeviceId(gpu_id);
if (!as[gpu_id]) {
as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::GpuMinChunkSize(), platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize()); platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use " VLOG(10) << "\n\nNOTE: each GPU device use "
...@@ -88,9 +114,17 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) { ...@@ -88,9 +114,17 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
} }
template <> template <>
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) { void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
bool is_pinned) {
void* ptr;
if (is_pinned) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
} else {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device); auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size); ptr = buddy_allocator->Alloc(size);
}
if (ptr == nullptr) { if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId(); int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device); platform::SetDeviceId(place.device);
...@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) { ...@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
} }
template <> template <>
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) { void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
bool is_pinned) {
if (is_pinned) {
GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
} else {
GetGPUBuddyAllocator(place.device)->Free(p); GetGPUBuddyAllocator(place.device)->Free(p);
}
} }
#endif #endif
......
...@@ -33,7 +33,7 @@ namespace memory { ...@@ -33,7 +33,7 @@ namespace memory {
* address is valid or not. * address is valid or not.
*/ */
template <typename Place> template <typename Place>
void* Alloc(Place place, size_t size); void* Alloc(Place place, size_t size, bool is_pinned = false);
/** /**
* \brief Free memory block in one place. * \brief Free memory block in one place.
...@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size); ...@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
* *
*/ */
template <typename Place> template <typename Place>
void Free(Place place, void* ptr); void Free(Place place, void* ptr, bool is_pinned = false);
/** /**
* \brief Total size of used memory in one place. * \brief Total size of used memory in one place.
...@@ -74,11 +74,13 @@ class PODDeleter { ...@@ -74,11 +74,13 @@ class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
public: public:
explicit PODDeleter(Place place) : place_(place) {} explicit PODDeleter(Place place, bool is_pinned = false)
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); } : place_(place), is_pinned_(is_pinned) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
private: private:
Place place_; Place place_;
bool is_pinned_;
}; };
/** /**
......
...@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { ...@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
EXPECT_EQ(total_size, 0UL); EXPECT_EQ(total_size, 0UL);
for (auto size : for (auto size :
{128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size; ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk // Buddy Allocator doesn't manage too large memory chunk
...@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { ...@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
EXPECT_EQ(total_size, 0UL); EXPECT_EQ(total_size, 0UL);
for (auto size : for (auto size :
{128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(gpu, size)] = size; ps[paddle::memory::Alloc(gpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk // Buddy Allocator doesn't manage too large memory chunk
......
...@@ -153,8 +153,13 @@ function(op_library TARGET) ...@@ -153,8 +153,13 @@ function(op_library TARGET)
# pybind USE_OP_DEVICE_KERNEL for MKLDNN # pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif() endif()
endif()
# pybind USE_OP # pybind USE_OP
if (${pybind_flag} EQUAL 0) if (${pybind_flag} EQUAL 0)
...@@ -178,13 +183,19 @@ if(WITH_DISTRIBUTE) ...@@ -178,13 +183,19 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(send_op DEPS ${DISTRIBUTE_DEPS}) op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
else() else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
endif() endif()
op_library(cond_op DEPS framework_proto tensor net_op) op_library(cond_op DEPS framework_proto tensor net_op)
...@@ -255,3 +266,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor ...@@ -255,3 +266,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "mkldnn_activation_op.h"
#include "paddle/fluid/operators/activation_op.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
template <typename T, typename ExecContext>
void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *src = ctx.template Input<Tensor>("X");
const auto *src_data = src->template data<T>();
auto *dst = ctx.template Output<Tensor>("Out");
const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
// get memory dim
PADDLE_ENFORCE(src->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW");
std::vector<int> src_tz = framework::vectorize2int(src->dims());
// create memory description
// TODO(kbinias-intel): support more formats
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data);
auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data);
auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
// save prim desc into global device context to be referred in backward path
const std::string key = ctx.op().Output("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
forward_desc, mkldnn_engine);
dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
template <typename T, typename ExecContext>
void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *x = ctx.template Input<Tensor>("X");
const auto *src = x->template data<T>();
auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
const auto *diff_dst = dout->template data<T>();
auto *dx =
ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
// get memory dim
std::vector<int> src_tz = framework::vectorize2int(x->dims());
// create memory description
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src);
auto diff_src_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src);
auto diff_dst_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst);
auto backward_desc =
mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
// retrieve eltwise primitive desc from device context
const std::string key = ctx.op().Input("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
PADDLE_ENFORCE(forward_pd != nullptr,
"Fail to find eltwise_pd in device context");
auto *p_forward_pd =
static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
backward_desc, mkldnn_engine, *p_forward_pd);
auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
diff_dst_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // anonymous namespace
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_forward<T>(ctx, algorithm);
}
};
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_grad<T>(ctx, algorithm);
}
};
template <typename T>
using ReluMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
template <typename T>
using ReluMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>); \
REGISTER_OP_KERNEL( \
act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
__macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
__macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
__macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
__macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Relu operator"); AddInput("X", "Input of Relu operator");
AddOutput("Out", "Output of Relu operator"); AddOutput("Out", "Output of Relu operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Relu Activation Operator. Relu Activation Operator.
...@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Tanh operator"); AddInput("X", "Input of Tanh operator");
AddOutput("Out", "Output of Tanh operator"); AddOutput("Out", "Output of Tanh operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Tanh Activation Operator. Tanh Activation Operator.
...@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Sqrt operator"); AddInput("X", "Input of Sqrt operator");
AddOutput("Out", "Output of Sqrt operator"); AddOutput("Out", "Output of Sqrt operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Sqrt Activation Operator. Sqrt Activation Operator.
...@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Abs operator"); AddInput("X", "Input of Abs operator");
AddOutput("Out", "Output of Abs operator"); AddOutput("Out", "Output of Abs operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Abs Activation Operator. Abs Activation Operator.
...@@ -247,6 +260,36 @@ $out = floor(x)$ ...@@ -247,6 +260,36 @@ $out = floor(x)$
} }
}; };
class CosOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CosOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Cosine operator");
AddOutput("Out", "Output of Cosine operator");
AddComment(R"DOC(
Cosine Activation Operator.
$out = cos(x)$
)DOC");
}
};
class SinOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Sine operator");
AddOutput("Out", "Output of Sine operator");
AddComment(R"DOC(
Sine Activation Operator.
$out = sin(x)$
)DOC");
}
};
class RoundOpMaker : public framework::OpProtoAndCheckerMaker { class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker)
...@@ -524,11 +567,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, ...@@ -524,11 +567,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
tanh_shrink_grad, ops::ActivationOpGrad); tanh_shrink_grad, ops::ActivationOpGrad);
...@@ -536,11 +579,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, ...@@ -536,11 +579,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
softshrink_grad, ops::ActivationOpGrad); softshrink_grad, ops::ActivationOpGrad);
REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
...@@ -548,6 +591,12 @@ REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, ...@@ -548,6 +591,12 @@ REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad, REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP(cos, ops::ActivationOp, ops::CosOpMaker, cos_grad,
ops::ActivationOpGrad);
REGISTER_OP(sin, ops::ActivationOp, ops::SinOpMaker, sin_grad,
ops::ActivationOpGrad);
REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad, REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -17,6 +17,10 @@ limitations under the License. */ ...@@ -17,6 +17,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -327,6 +331,54 @@ struct FloorFunctor : public BaseActivationFunctor<T> { ...@@ -327,6 +331,54 @@ struct FloorFunctor : public BaseActivationFunctor<T> {
} }
}; };
template <typename T>
struct Sine {
HOSTDEVICE T operator()(const T& val) const { return sin(val); }
};
template <typename T>
struct Cosine {
HOSTDEVICE T operator()(const T& val) const { return cos(val); }
};
// cosine'(x) = -sin(x)
template <typename T>
struct CosGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = -dout * x.unaryExpr(Sine<T>());
}
};
// cosine(x) = cos(x)
template <typename T>
struct CosFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Cosine<T>());
}
};
// sine'(x) = cos(x)
template <typename T>
struct SinGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * x.unaryExpr(Cosine<T>());
}
};
// sine(x) = sin(x)
template <typename T>
struct SinFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Sine<T>());
}
};
// round(x) = [x] // round(x) = [x]
template <typename T> template <typename T>
struct RoundFunctor : public BaseActivationFunctor<T> { struct RoundFunctor : public BaseActivationFunctor<T> {
...@@ -778,6 +830,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -778,6 +830,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(abs, AbsFunctor, AbsGradFunctor); \ __macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, FloorFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \
__macro(cos, CosFunctor, CosGradFunctor); \
__macro(sin, SinFunctor, SinGradFunctor); \
__macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \
......
...@@ -23,21 +23,10 @@ limitations under the License. */ ...@@ -23,21 +23,10 @@ limitations under the License. */
static constexpr char Channel[] = "Channel"; static constexpr char Channel[] = "Channel";
static constexpr char X[] = "X"; static constexpr char X[] = "X";
static constexpr char Status[] = "Status";
static constexpr char copy[] = "copy";
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void SetSendStatus(const platform::Place &dev_place,
framework::Variable &status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
class ChannelSendOp : public framework::OperatorBase { class ChannelSendOp : public framework::OperatorBase {
public: public:
ChannelSendOp(const std::string &type, ChannelSendOp(const std::string &type,
...@@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase {
"Input(Channel) of ChannelSendOp should not be null."); "Input(Channel) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(X), PADDLE_ENFORCE(ctx->HasInput(X),
"Input(X) of ChannelSendOp should not be null."); "Input(X) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelSendOp should not be null.");
ctx->SetOutputDim("Status", {1});
} }
private: private:
...@@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase {
auto input_var = scope.FindVar(Input(X)); auto input_var = scope.FindVar(Input(X));
// Send the input data through the channel. // Send the input data through the channel.
bool ok = concurrency::ChannelSend(ch, input_var); concurrency::ChannelSend(ch, input_var);
// Set the status output of the `ChannelSend` call.
SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
} }
}; };
...@@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
.AsDuplicable(); .AsDuplicable();
AddInput(X, "(Variable) The value which gets sent by the channel.") AddInput(X, "(Variable) The value which gets sent by the channel.")
.AsDuplicable(); .AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the send operation.")
.AsDuplicable();
AddAttr<bool>(copy, "(bool, default false) Should copy before send")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
)DOC"); )DOC");
} }
......
...@@ -29,6 +29,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -29,6 +29,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Y", string::Sprintf( AddInput("Y", string::Sprintf(
"(LoDTensor) the right hand operand of %s operator", "(LoDTensor) the right hand operand of %s operator",
comment.type)); comment.type));
AddAttr<bool>("force_cpu",
"(bool, default false) Force fill output variable to cpu "
"memory. Otherwise, fill output variable to the running "
"device")
.SetDefault(false);
AddOutput("Out", string::Sprintf( AddOutput("Out", string::Sprintf(
"(LoDTensor) n-dim bool tensor. Each element is %s", "(LoDTensor) n-dim bool tensor. Each element is %s",
comment.equation)); comment.equation));
...@@ -75,7 +80,9 @@ class CompareOp : public framework::OperatorWithKernel { ...@@ -75,7 +80,9 @@ class CompareOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// CompareOp kernel's device type is decided by input tensor place // CompareOp kernel's device type is decided by input tensor place
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place(); bool force_cpu = ctx.Attr<bool>("force_cpu");
kt.place_ = force_cpu ? platform::CPUPlace()
: ctx.Input<framework::LoDTensor>("X")->place();
return kt; return kt;
} }
}; };
......
...@@ -17,20 +17,20 @@ limitations under the License. */ ...@@ -17,20 +17,20 @@ limitations under the License. */
namespace poc = paddle::operators::concurrency; namespace poc = paddle::operators::concurrency;
bool poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type()); auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR) if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Send(var->GetMutable<framework::LoDTensor>()); ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Send(var->GetMutable<framework::LoDRankTable>()); ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Send(var->GetMutable<framework::LoDTensorArray>()); ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS) else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Send(var->GetMutable<framework::SelectedRows>()); ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER) else if (type == framework::proto::VarType_Type_READER)
return ch->Send(var->GetMutable<framework::ReaderHolder>()); ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL) else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Send(var->GetMutable<framework::ChannelHolder>()); ch->Send(var->GetMutable<framework::ChannelHolder>());
else else
PADDLE_THROW("ChannelSend:Unsupported type"); PADDLE_THROW("ChannelSend:Unsupported type");
} }
......
...@@ -21,7 +21,7 @@ namespace paddle { ...@@ -21,7 +21,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace concurrency { namespace concurrency {
bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
......
...@@ -54,7 +54,18 @@ class ConditionalOp : public framework::OperatorBase { ...@@ -54,7 +54,18 @@ class ConditionalOp : public framework::OperatorBase {
"numel should be 1, actual numel is %d", "numel should be 1, actual numel is %d",
ips[0]->numel()); ips[0]->numel());
} }
return ips[0]->data<bool>()[0]; bool res = false;
if (platform::is_gpu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_CUDA
framework::LoDTensor cpu_tensor;
framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
res = cpu_tensor.data<bool>()[0];
#endif
} else {
res = ips[0]->data<bool>()[0];
}
return res;
} }
}; };
......
...@@ -2,7 +2,7 @@ if(WITH_DISTRIBUTE) ...@@ -2,7 +2,7 @@ if(WITH_DISTRIBUTE)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(serde_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc) cares zlib protobuf sendrecvop_grpc)
endif() endif()
...@@ -49,9 +49,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -49,9 +49,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
s->Prepare(var_h, time_out); s->Prepare(var_h, time_out);
s->response_call_back_ = NULL; s->response_call_back_ = NULL;
auto call = std::move(s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
&cq_));
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s); call->Finish(&s->reply_, &s->status_, (void*)s);
}); });
...@@ -89,10 +88,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -89,10 +88,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
// prepare input
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name_val); req.set_varname(var_name_val);
::grpc::ByteBuffer buf;
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
// varhandle // var handle
VarHandle var_h; VarHandle var_h;
var_h.ep = ep_val; var_h.ep = ep_val;
var_h.scope = p_scope; var_h.scope = p_scope;
...@@ -104,11 +106,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -104,11 +106,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
s->Prepare(var_h, time_out); s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
::grpc::ByteBuffer buf; auto call = s->stub_g_.PrepareUnaryCall(
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf); s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
auto call = std::move(s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_));
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s); call->Finish(&s->reply_, &s->status_, (void*)s);
}); });
...@@ -118,6 +117,48 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -118,6 +117,48 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
return true; return true;
} }
bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string in_var_name_val = in_var_name;
const std::string out_var_name_val = out_var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
time_out, ch, this] {
auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req);
// var handle
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = out_var_name_val;
var_h.ctx = p_ctx;
// stub context
GetProcessor* s = new GetProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse;
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s);
});
req_count_++;
return true;
}
void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
...@@ -205,7 +246,6 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) { ...@@ -205,7 +246,6 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
} }
grpc::ChannelArguments args; grpc::ChannelArguments args;
args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 5000);
args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
args.SetMaxSendMessageSize(std::numeric_limits<int>::max()); args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max()); args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
......
...@@ -172,6 +172,13 @@ class RPCClient { ...@@ -172,6 +172,13 @@ class RPCClient {
const std::string& var_name, const std::string& var_name,
int64_t time_out = 600 * 1000); int64_t time_out = 600 * 1000);
bool AsyncPrefetchVariable(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
int64_t time_out = 600 * 1000);
void AsyncSendBatchBarrier(const std::string& ep, void AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out = 600 * 1000); int64_t time_out = 600 * 1000);
......
...@@ -174,13 +174,13 @@ void AsyncGRPCServer::ShutdownQueue() { ...@@ -174,13 +174,13 @@ void AsyncGRPCServer::ShutdownQueue() {
std::unique_lock<std::mutex> lock(cq_mutex_); std::unique_lock<std::mutex> lock(cq_mutex_);
cq_send_->Shutdown(); cq_send_->Shutdown();
cq_get_->Shutdown(); cq_get_->Shutdown();
is_shut_down_ = true;
} }
// This URL explains why shutdown is complicate: // This URL explains why shutdown is complicate:
void AsyncGRPCServer::ShutDown() { void AsyncGRPCServer::ShutDown() {
server_->Shutdown(); is_shut_down_ = true;
ShutdownQueue(); ShutdownQueue();
server_->Shutdown();
} }
void AsyncGRPCServer::TryToRegisterNewSendOne() { void AsyncGRPCServer::TryToRegisterNewSendOne() {
...@@ -213,14 +213,14 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, ...@@ -213,14 +213,14 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
bool ok = false; bool ok = false;
while (true) { while (true) {
if (!cq->Next(&tag, &ok)) { if (!cq->Next(&tag, &ok)) {
LOG(INFO) << cq_name << " get CompletionQueue shutdown!"; LOG(INFO) << cq_name << " CompletionQueue shutdown!";
break; break;
} }
PADDLE_ENFORCE(tag); PADDLE_ENFORCE(tag);
// FIXME(typhoonzero): de-couple the barriers with recv_op // FIXME(typhoonzero): de-couple the barriers with recv_op
if (cq_name == "cq_get") WaitCond(1); if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
if (cq_name == "cq_send") WaitCond(0); if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
RequestBase* base = (RequestBase*)tag; RequestBase* base = (RequestBase*)tag;
// reference: // reference:
......
...@@ -59,12 +59,12 @@ message VariableMessage { ...@@ -59,12 +59,12 @@ message VariableMessage {
// lod details: // lod details:
int64 lod_level = 5; int64 lod_level = 5;
repeated LodData lod = 6; repeated LodData lod = 6;
// selected_rows height, aka. original dim0
int64 slr_height = 7;
// tensor data // tensor data
bytes serialized = 7; bytes serialized = 8;
// selected_rows data // selected_rows data
bytes rows = 8; bytes rows = 9;
} }
message VoidMessage {} message VoidMessage {}
message TestMessage { int64 test_1 = 1; }
...@@ -108,6 +108,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -108,6 +108,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
e.WriteUint64(VarMsg::kDimsFieldNumber, dim); e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
} }
e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0); e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
e.WriteUint64(VarMsg::kSlrHeightFieldNumber, slr->height());
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -154,7 +155,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -154,7 +155,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
ProtoEncodeHelper e2((char*)buf, 128); ProtoEncodeHelper e2((char*)buf, 128);
// NOTE: rows is of type int64_t // NOTE: rows is of type int64_t
size_t rows_memory_size = size_t rows_memory_size =
slr->rows().capacity() * framework::SizeOfType(typeid(int64_t)); slr->rows().size() * framework::SizeOfType(typeid(int64_t));
e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
slices[2] = ::grpc::Slice(e2.size()); slices[2] = ::grpc::Slice(e2.size());
memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size()); memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <sys/time.h>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -35,6 +36,12 @@ namespace detail { ...@@ -35,6 +36,12 @@ namespace detail {
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
static int64_t GetTimestamp() {
struct timeval tp;
gettimeofday(&tp, NULL);
return tp.tv_sec * 1000 + tp.tv_usec / 1000;
}
typedef void (*DestroyCallback)(void*); typedef void (*DestroyCallback)(void*);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
......
...@@ -40,14 +40,14 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -40,14 +40,14 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// serialize var to ByteBuffer // serialize var to ByteBuffer
framework::Variable var; framework::Variable var;
auto* slr = var.GetMutable<framework::SelectedRows>(); auto* slr = var.GetMutable<framework::SelectedRows>();
slr->set_height(1000);
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows(); auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({2, 10})); tensor->Resize(framework::make_ddim({564, 128}));
tensor->mutable_data<float>(place); tensor->mutable_data<float>(place);
int tensor_numel = 2 * 10; int tensor_numel = 564 * 128;
math::set_constant(ctx, tensor, 32.7); math::set_constant(ctx, tensor, 32.7);
rows->push_back(3); for (int i = 0; i < 564; ++i) rows->push_back(i);
rows->push_back(10);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
...@@ -64,6 +64,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -64,6 +64,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
sendrecv::VariableMessage varmsg; sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp)); EXPECT_TRUE(varmsg.ParseFromString(tmp));
// deserialize bytebuffer
EXPECT_EQ(varmsg.varname(), "myvar"); EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 1); EXPECT_EQ(varmsg.type(), 1);
...@@ -74,8 +75,10 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -74,8 +75,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < tensor_numel; ++i) { for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 32.7); EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
} }
EXPECT_EQ(rows_data[0], 3); for (int i = 0; i < 564; ++i) {
EXPECT_EQ(rows_data[1], 10); EXPECT_EQ(rows_data[i], i);
}
// deserialize zero-copy // deserialize zero-copy
// framework::Variable var2; // framework::Variable var2;
// operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
...@@ -104,8 +107,10 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -104,8 +107,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < tensor_numel; ++i) { for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
} }
EXPECT_EQ(rows_data2[0], 3); for (int i = 0; i < rows2->size(); ++i) {
EXPECT_EQ(rows_data2[1], 10); EXPECT_EQ(rows_data2[i], i);
}
EXPECT_EQ(slr2->height(), 1000);
} }
void RunTestLodTensor(platform::Place place, int from_type = 0) { void RunTestLodTensor(platform::Place place, int from_type = 0) {
......
...@@ -48,6 +48,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, ...@@ -48,6 +48,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
void* dest, int size) { void* dest, int size) {
const void* data = NULL; const void* data = NULL;
int size_to_write = 0; int size_to_write = 0;
int length = size;
int total_written = 0;
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -56,16 +58,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, ...@@ -56,16 +58,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
platform::CPUPlace cpu; platform::CPUPlace cpu;
char* p = reinterpret_cast<char*>(dest); char* p = reinterpret_cast<char*>(dest);
while (size > 0) { while (total_written < length) {
if (!input->GetDirectBufferPointer(&data, &size_to_write)) { if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
return false; return false;
} }
// NOTE: if raw buffer is large and have two neighbor fields of raw
// buffers GetDirectBufferPointer can get all of them, use length to
// truncate it.
if (total_written + size_to_write > length) {
size_to_write = length - total_written;
}
memory::Copy(boost::get<platform::CUDAPlace>(place), memory::Copy(boost::get<platform::CUDAPlace>(place),
reinterpret_cast<void*>(p), cpu, data, size_to_write, reinterpret_cast<void*>(p), cpu, data, size_to_write,
gpu_dev_ctx.stream()); gpu_dev_ctx.stream());
p += size_to_write; p += size_to_write;
size -= size_to_write; total_written += size_to_write;
input->Skip(size_to_write); input->Skip(size_to_write);
} }
...@@ -77,16 +84,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, ...@@ -77,16 +84,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
} }
char* p = reinterpret_cast<char*>(dest); char* p = reinterpret_cast<char*>(dest);
while (size > 0) { while (total_written < length) {
if (!input->GetDirectBufferPointer(&data, &size_to_write)) { if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
return false; return false;
} }
// NOTE: if raw buffer is large and have two neighbor fields of raw buffers
// GetDirectBufferPointer can get all of them, use length to truncate it.
if (total_written + size_to_write > length) {
size_to_write = length - total_written;
}
// TODO(gongwb): can we avoid copy? // TODO(gongwb): can we avoid copy?
platform::CPUPlace cpu; platform::CPUPlace cpu;
memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write); memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
p += size_to_write; p += size_to_write;
size -= size_to_write; total_written += size_to_write;
input->Skip(size_to_write); input->Skip(size_to_write);
} }
...@@ -135,8 +147,13 @@ bool VariableResponse::CopySelectRowsTensorData( ...@@ -135,8 +147,13 @@ bool VariableResponse::CopySelectRowsTensorData(
const platform::DeviceContext& ctx, framework::DDim& dims, int length) { const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
auto var = scope_->FindVar(meta_.varname()); auto var = scope_->FindVar(meta_.varname());
auto* slr = var->GetMutable<framework::SelectedRows>(); auto* slr = var->GetMutable<framework::SelectedRows>();
slr->set_height(meta_.slr_height());
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
tensor->Resize(dims); tensor->Resize(dims);
PADDLE_ENFORCE_EQ(
tensor->numel(),
length / framework::SizeOfType(
paddle::operators::detail::ToTypeIndex(meta_.data_type())));
void* tensor_data = tensor->mutable_data( void* tensor_data = tensor->mutable_data(
ctx.GetPlace(), ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta_.data_type())); paddle::operators::detail::ToTypeIndex(meta_.data_type()));
...@@ -153,6 +170,8 @@ bool VariableResponse::CopySelectRowsData( ...@@ -153,6 +170,8 @@ bool VariableResponse::CopySelectRowsData(
const platform::DeviceContext& ctx, int length) { const platform::DeviceContext& ctx, int length) {
auto var = scope_->FindVar(meta_.varname()); auto var = scope_->FindVar(meta_.varname());
auto* slr = var->GetMutable<framework::SelectedRows>(); auto* slr = var->GetMutable<framework::SelectedRows>();
slr->mutable_rows()->resize(length /
framework::SizeOfType(typeid(int64_t))); // int64
int64_t* rows_data = slr->mutable_rows()->data(); int64_t* rows_data = slr->mutable_rows()->data();
// copy rows CPU data, GPU data will be copied lazily. // copy rows CPU data, GPU data will be copied lazily.
...@@ -233,7 +252,6 @@ int VariableResponse::Parse(Source* source) { ...@@ -233,7 +252,6 @@ int VariableResponse::Parse(Source* source) {
if (tag != 0) { if (tag != 0) {
return -1; return -1;
} }
return 0; return 0;
} }
...@@ -336,6 +354,14 @@ int VariableResponse::Parse(Source* source) { ...@@ -336,6 +354,14 @@ int VariableResponse::Parse(Source* source) {
} }
break; break;
} }
case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
uint64_t v = 0;
if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
return tag;
}
meta_.set_slr_height(static_cast<int64_t>(v));
break;
}
case sendrecv::VariableMessage::kSerializedFieldNumber: { case sendrecv::VariableMessage::kSerializedFieldNumber: {
PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
meta_.type() == sendrecv::LOD_TENSOR) && meta_.type() == sendrecv::LOD_TENSOR) &&
......
...@@ -55,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -55,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
y->mutable_data<T>(context.GetPlace()); y->mutable_data<T>(context.GetPlace());
float dropout_prob = context.Attr<float>("dropout_prob"); float dropout_prob = context.Attr<float>("dropout_prob");
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto& place = *context.template device_context<Place>().eigen_device(); auto& place = *context.template device_context<Place>().eigen_device();
if (!context.Attr<bool>("is_test")) { if (!context.Attr<bool>("is_test")) {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
...@@ -76,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -76,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>( T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, seed, dropout_prob, x_data, mask_data, y_data); size, seed, dropout_prob, x_data, mask_data, y_data);
} else { } else {
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob); Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
} }
} }
......
...@@ -11,9 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,9 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <random> #include <random>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(dropout);
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto var = scope->Var("X");
auto tensor = var->GetMutable<f::LoDTensor>();
tensor->Resize({10, 10});
std::vector<float> init;
for (int64_t i = 0; i < 10 * 10; ++i) {
init.push_back(1.0);
}
TensorFromVector(init, ctx, tensor);
auto place = ctx.GetPlace();
auto out_var = scope->Var("Out");
auto out_tensor = out_var->GetMutable<f::LoDTensor>();
out_tensor->Resize({10, 10});
out_tensor->mutable_data<float>(place); // allocate
auto mask_var = scope->Var("Mask");
auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
mask_tensor->Resize({10, 10});
mask_tensor->mutable_data<float>(place); // allocate
// run
f::AttributeMap attrs;
float dropout_prob = 0.5;
attrs.insert({"fix_seed", 1});
attrs.insert({"seed", 3});
attrs.insert({"dropout_prob", dropout_prob});
auto dropout_op = f::OpRegistry::CreateOp(
"dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
dropout_op->Run(*scope, place);
std::vector<float> out_vec;
TensorToVector(*out_tensor, ctx, &out_vec);
std::vector<float> std_out = {
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1};
EXPECT_EQ(out_vec.size(), std_out.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], std_out[i]);
}
}
// TODO(wyi): Due to
// https://github.com/PaddlePaddle/Paddle/issues/9507, I temporarily
// disable this test to remove the prevention of the merge of
// unrelated PRs.
/*
TEST(Dropout, CPUDense) {
f::Scope scope;
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
Compare(scope, ctx);
}
TEST(Dropout, GPUDense) {
f::Scope scope;
p::CUDAPlace place;
p::CUDADeviceContext ctx(place);
Compare(scope, ctx);
}
*/
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
You may obtain a copy of the License at // You may obtain a copy of the License at
//
http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
//
Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
limitations under the License. */ // limitations under the License.
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/increment_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class IncrementInferShape : public framework::InferShapeBase { class IncrementOp : public framework::OperatorWithKernel {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of IncrementOp should not be null."); "Input(X) of IncrementOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of IncrementOp should not be null."); "Output(Out) of IncrementOp should not be null.");
PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X"))); PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X")));
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", "Out");
} }
};
struct IncrementFunctor {
IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out,
float value)
: x_(x), out_(out), value_(value) {}
template <typename T>
void operator()() const {
*out_->data<T>() = *x_.data<T>() + static_cast<T>(value_);
}
const framework::LoDTensor &x_;
framework::LoDTensor *out_;
float value_;
};
class IncrementOp : public framework::OperatorBase {
public:
IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(platform::is_cpu_place(x.place())); protected:
out.Resize(x.dims()); framework::OpKernelType GetExpectedKernelType(
out.mutable_data(x.place(), x.type()); const framework::ExecutionContext &ctx) const override {
float value = Attr<float>("step"); framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
VLOG(10) << Output("Out") << " increase " << Input("X") << " with " // IncrementOp kernel's device type is decided by input tensor place
<< value; kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
framework::VisitDataType(framework::ToDataType(out.type()), return kt;
IncrementFunctor(x, &out, value));
} }
}; };
...@@ -108,5 +83,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { ...@@ -108,5 +83,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape, REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
ops::IncrementOpMaker, ops::IncrementGradOpMaker); ops::IncrementGradOpMaker);
REGISTER_OP_CPU_KERNEL(
increment, ops::IncrementKernel<paddle::platform::CPUDeviceContext, float>,
ops::IncrementKernel<paddle::platform::CPUDeviceContext, double>,
ops::IncrementKernel<paddle::platform::CPUDeviceContext, int>,
ops::IncrementKernel<paddle::platform::CPUDeviceContext, int64_t>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/increment_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
increment, ops::IncrementKernel<paddle::platform::CUDADeviceContext, float>,
ops::IncrementKernel<paddle::platform::CUDADeviceContext, double>,
ops::IncrementKernel<paddle::platform::CUDADeviceContext, int>,
ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class IncrementKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x_tensor = context.Input<framework::Tensor>("X");
auto* out_tensor = context.Output<framework::Tensor>("Out");
float step = context.Attr<float>("step");
out_tensor->mutable_data<T>(context.GetPlace());
auto& dev =
*context.template device_context<DeviceContext>().eigen_device();
framework::EigenScalar<T>::From(*out_tensor).device(dev) =
framework::EigenScalar<T>::From(*x_tensor) + static_cast<T>(step);
}
};
} // namespace operators
} // namespace paddle
...@@ -22,6 +22,103 @@ limitations under the License. */ ...@@ -22,6 +22,103 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Wrap RowwiseMean and ColwiseMean.
// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is
// significantly faster. Unlike the RowwiseMean and ColwiseMean, the
// implementation only considers 2D.
template <typename DeviceContext, typename T>
struct RowwiseMean2D {
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class RowwiseMean2D<platform::CUDADeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({right_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
math::set_constant(dev_ctx, &divisor_, 1.0 / right);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
math::gemv<platform::CUDADeviceContext, T>(
context, false, left_, right_, 1., input.data<T>(), divisor_.data<T>(),
0., out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class RowwiseMean2D<platform::CPUDeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
row_mean_(context, input, out);
}
private:
math::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;
};
template <typename DeviceContext, typename T>
struct ColwiseSum2D {
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class ColwiseSum2D<platform::CUDADeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({left_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
math::set_constant(dev_ctx, &divisor_, 1.0);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
math::gemv<platform::CUDADeviceContext, T>(
context, true, left_, right_, 1., input.data<T>(), divisor_.data<T>(),
0., out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class ColwiseSum2D<platform::CPUDeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
col_wise_(context, input, out);
}
private:
math::ColwiseSum<platform::CPUDeviceContext, T> col_wise_;
};
template <typename T> template <typename T>
struct SubAndSquareFunctor { struct SubAndSquareFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
...@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout; ...@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> { class LayerNormKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale"); auto* scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto x = *ctx.Input<Tensor>("X"); auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y"); auto* y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean"); auto* mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance"); auto* var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis"); const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x.dims(); const auto x_dims = x.dims();
...@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
out.ShareDataWith(*y); out.ShareDataWith(*y);
out.Resize(matrix_shape); out.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::RowwiseMean<DeviceContext, T> row_mean; RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
// get mean // get mean
row_mean(dev_ctx, x, mean); row_mean(dev_ctx, x, mean);
...@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> { class LayerNormGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
auto x = *ctx.Input<Tensor>("X"); auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
auto *mean = ctx.Input<Tensor>("Mean"); auto* mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance"); auto* var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale"); auto* scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y")); auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis"); const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
// init output // init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X")); auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale")); auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias")); auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto &x_dims = x.dims(); const auto& x_dims = x.dims();
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]); int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]); int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right}); framework::DDim matrix_shape({left, right});
d_y.Resize(matrix_shape); d_y.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::ColwiseSum<DeviceContext, T> colwise_sum; ColwiseSum2D<DeviceContext, T> colwise_sum(left, right,
ctx.device_context());
Tensor temp; Tensor temp;
Tensor temp_norm; Tensor temp_norm;
...@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> { ...@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
Tensor temp_vec; Tensor temp_vec;
temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace()); temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
math::RowwiseMean<DeviceContext, T> row_mean; RowwiseMean2D<DeviceContext, T> row_mean(left, right,
ctx.device_context());
if (d_scale) { if (d_scale) {
// dy_dx // dy_dx
......
...@@ -54,6 +54,24 @@ static void CreateTensorFromMessageType(framework::Variable *var, ...@@ -54,6 +54,24 @@ static void CreateTensorFromMessageType(framework::Variable *var,
} }
} }
static void ParallelExecuteBlocks(const std::vector<size_t> &parallel_blkids,
framework::Executor *executor,
framework::ProgramDesc *program,
framework::Scope *scope) {
std::vector<std::future<void>> fs;
for (size_t idx : parallel_blkids) {
fs.push_back(framework::Async([&executor, &program, &scope, idx]() {
int run_block = idx; // thread local
try {
executor->Run(*program, scope, run_block, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
}));
}
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
}
class ListenAndServOp : public framework::OperatorBase { class ListenAndServOp : public framework::OperatorBase {
public: public:
ListenAndServOp(const std::string &type, ListenAndServOp(const std::string &type,
...@@ -70,7 +88,6 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -70,7 +88,6 @@ class ListenAndServOp : public framework::OperatorBase {
void Stop() override { void Stop() override {
rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
rpc_service_->ShutDown();
server_thread_->join(); server_thread_->join();
} }
...@@ -139,36 +156,28 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -139,36 +156,28 @@ class ListenAndServOp : public framework::OperatorBase {
break; break;
} }
// put optimize blocks in the thread pool to start run, the last block
// should be global ops.
// NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads
// and this will still work. // and this will still work.
std::vector<std::future<void>> fs; // The optimize blocks which have the same parent ID would run parallel
// block0 contains only listen_and_serv op, start run from block1. // TODO(Yancey1989): need to use ParallelExecutor for future
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { size_t last_parent_blkid = program->Block(1).Parent();
fs.push_back(framework::Async( std::vector<size_t> parallel_blkids;
[&executor, &program, &recv_scope, &prepared, blkid]() { parallel_blkids.push_back(1);
int run_block = blkid; // thread local double ts = detail::GetTimestamp();
try { for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
executor.RunPreparedContext(prepared[run_block].get(), if (program->Block(blkid).Parent() != last_parent_blkid) {
&recv_scope, false, false); for (size_t idx : parallel_blkids) VLOG(3) << idx;
} catch (std::exception &e) { ParallelExecuteBlocks(parallel_blkids, &executor, program,
LOG(ERROR) << "run sub program error " << e.what(); &recv_scope);
} parallel_blkids.clear();
})); last_parent_blkid = program->Block(blkid).Parent();
} }
for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait(); parallel_blkids.push_back(blkid);
// Run global block at final step, or block1 if there are only 2 blocks }
if (num_blocks >= 2) { ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope);
try {
// executor.Run(program, &recv_scope, num_blocks - 1, false, false); VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
executor.RunPreparedContext(prepared[num_blocks - 1].get(),
&recv_scope, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
}
// Reset the received sparse variables, the sum operator would not // Reset the received sparse variables, the sum operator would not
// sum the input sparse variables which rows is empty at the next // sum the input sparse variables which rows is empty at the next
......
...@@ -36,6 +36,14 @@ std::shared_ptr<T> insert_to_context(const std::string& key, ...@@ -36,6 +36,14 @@ std::shared_ptr<T> insert_to_context(const std::string& key,
return p; return p;
} }
template <typename... Args>
void run_primitive(Args&&... args) {
auto forward_op = mkldnn::lrn_forward{args...};
std::vector<mkldnn::primitive> pipeline = {forward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // namespace } // namespace
template <typename T> template <typename T>
...@@ -87,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -87,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
static_cast<void*>(output_data)}; static_cast<void*>(output_data)};
std::unique_ptr<mkldnn::lrn_forward> forward_op = nullptr;
if (!is_test) { if (!is_test) {
const std::string key = ctx.op().Output("Out"); const std::string key = ctx.op().Output("Out");
const std::string key_src_memory = key + "@lrn_src_memory"; const std::string key_src_memory = key + "@lrn_src_memory";
...@@ -108,9 +114,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -108,9 +114,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
key_workspace_memory, dev_ctx, key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc()); forward_pd->workspace_primitive_desc());
forward_op.reset(new mkldnn::lrn_forward{*forward_pd, *src_memory, run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
*workspace_memory, dst_memory});
} else { } else {
auto forward_pd = auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
...@@ -119,12 +123,8 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -119,12 +123,8 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto workspace_memory = auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()}; mkldnn::memory{forward_pd.workspace_primitive_desc()};
forward_op.reset(new mkldnn::lrn_forward{forward_pd, src_memory, run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
workspace_memory, dst_memory});
} }
std::vector<mkldnn::primitive> pipeline = {*forward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
} }
}; };
...@@ -136,6 +136,9 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -136,6 +136,9 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
"MKLDNN LRN must use float data."); "MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace."); "MKLDNN LRN must use CPUPlace.");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
auto x = ctx.Input<Tensor>("X"); auto x = ctx.Input<Tensor>("X");
......
...@@ -155,8 +155,8 @@ class LRNOp : public framework::OperatorWithKernel { ...@@ -155,8 +155,8 @@ class LRNOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4.");
ctx->SetOutputDim("Out", x_dim); ctx->SetOutputDim("Out", x_dim);
ctx->SetOutputDim("MidOut", x_dim);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
ctx->SetOutputDim("MidOut", x_dim);
} }
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
...@@ -214,7 +214,10 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -214,7 +214,10 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
"Defaults to \"NHWC\". Specify the data format of the output data, " "Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"Turns on memory optimization that optimizes away "
"unnecessary memory allocations. Used by MKLDNN.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Local Response Normalization Operator. Local Response Normalization Operator.
......
...@@ -121,6 +121,10 @@ class LRNGradKernel : public framework::OpKernel<T> { ...@@ -121,6 +121,10 @@ class LRNGradKernel : public framework::OpKernel<T> {
T alpha = ctx.Attr<T>("alpha"); T alpha = ctx.Attr<T>("alpha");
T beta = ctx.Attr<T>("beta"); T beta = ctx.Attr<T>("beta");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
LRNGradFunctor<DeviceContext, T> f; LRNGradFunctor<DeviceContext, T> f;
f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta);
} }
......
...@@ -20,7 +20,7 @@ namespace math { ...@@ -20,7 +20,7 @@ namespace math {
/* /*
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> { class ConcatFunctor<platform::CPUDeviceContext, T> {
...@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> { ...@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
/* /*
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> { class ConcatGradFunctor<platform::CPUDeviceContext, T> {
......
...@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, ...@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
} }
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col, __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
const int output_rows, const int output_cols, const int out_rows, const int out_cols,
T* output) { T* output_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col; for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { int split = tid_x * 1.0 / fixed_in_col;
int split = tid_x * inv_input_col; int in_offset = tid_x - split * fixed_in_col;
int in_offset = tid_x - split * input_col; T* input_ptr = inputs_data[split];
T* input_ptr = inputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
output[tid_y * output_cols + tid_x] = output_data[tid_y * out_cols + tid_x] =
input_ptr[tid_y * input_col + in_offset]; input_ptr[tid_y * fixed_in_col + in_offset];
} }
} }
} }
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row, __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int input_col, const int* output_cols, const int in_col, const int* out_cols,
int col_size, T** outputs) { int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1; int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
int curr_offset = output_cols[segment]; int curr_offset = out_cols[segment];
int curr_segment = segment; int curr_segment = segment;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; T curr_col_offset;
while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs[curr_segment]; T* output_ptr = outputs_data[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] = output_ptr[tid_y * segment_width + local_col] =
input[tid_y * input_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
} }
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row, __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int input_col, const int output_cols, const int in_col, const int fixed_out_col,
T** outputs) { T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col; for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { int split = tid_x / fixed_out_col;
int split = tid_x * inv_input_col; int in_offset = tid_x - split * fixed_out_col;
int in_offset = tid_x - split * input_col; T* output_ptr = outputs_data[split];
T* output_ptr = outputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * output_cols + in_offset] = output_ptr[tid_y * fixed_out_col + in_offset] =
input[tid_y * input_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
} }
/* /*
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> { class ConcatFunctor<platform::CUDADeviceContext, T> {
...@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
const std::vector<framework::Tensor>& input, const int axis, const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) { framework::Tensor* output) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = input.size(); int in_num = input.size();
int rows = 1; int in_row = 1;
auto dim_0 = input[0].dims(); auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
rows *= dim_0[i]; in_row *= dim_0[i];
} }
int cols = input[0].numel() / rows; int in_col = input[0].numel() / in_row;
int out_rows = rows, out_cols = 0; int out_row = in_row, out_col = 0;
framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2); framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
framework::Vector<int> inputs_cols(num + 1); framework::Vector<int> inputs_col(in_num + 1);
inputs_cols[0] = 0;
T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data()); T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
inputs_col[0] = 0;
bool sameShape = true; bool sameShape = true;
for (int i = 0; i < num; ++i) { for (int i = 0; i < in_num; ++i) {
int t_cols = input[i].numel() / rows; int t_cols = input[i].numel() / in_row;
if (sameShape) { if (sameShape) {
if (t_cols != cols) sameShape = false; if (t_cols != in_col) sameShape = false;
} }
out_cols += t_cols; out_col += t_cols;
inputs_cols[i + 1] = out_cols; inputs_col[i + 1] = out_col;
inputs_ptr[i] = const_cast<T*>(input[i].data<T>()); inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
} }
T** ins_gpu = T** dev_ins_data =
reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace())); reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
// computation // computation
// set the thread block and grid according to CurrentDeviceId // set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. if (out_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_cols + 31) >> 5) << 5; block_cols = ((out_col + 31) >> 5) << 5;
} }
int block_rows = kThreadsPerBlock / block_cols; int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1); dim3 block_size = dim3(block_cols, block_rows, 1);
...@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols = int grid_cols =
std::min((out_cols + block_cols - 1) / block_cols, max_blocks); std::min((out_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows = int grid_rows =
std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, cols, out_rows, out_cols, output->data<T>()); dev_ins_data, in_col, out_row, out_col, output->data<T>());
} else { } else {
const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows, dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_cols, output->data<T>()); out_row, out_col, output->data<T>());
} }
} }
}; };
/* /*
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> { class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const int axis, const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) { std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = outputs.size(); int o_num = outputs.size();
int input_row = 1; int out_row = 1;
auto dim_0 = outputs[0].dims(); auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
input_row *= dim_0[i]; out_row *= dim_0[i];
} }
int output_col_0 = outputs[0].numel() / input_row; int out_col = outputs[0].numel() / out_row;
int input_col = 0; int in_col = 0, in_row = out_row;
bool sameShape = true; bool sameShape = true;
framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2); framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(num + 1); framework::Vector<int> outputs_cols(o_num + 1);
outputs_cols[0] = 0;
T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data()); T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
for (int i = 0; i < num; ++i) { outputs_cols[0] = 0;
int t_col = outputs[i].numel() / input_row; for (int i = 0; i < o_num; ++i) {
int t_col = outputs[i].numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != output_col_0) sameShape = false; if (t_col != out_col) sameShape = false;
} }
input_col += t_col; in_col += t_col;
outputs_cols[i + 1] = input_col; outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs[i].data<T>(); outputs_ptr[i] = outputs[i].data<T>();
} }
T** outs_gpu = T** dev_out_gpu_data =
reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace())); reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
// computation // computation
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. if (in_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((input_col + 31) >> 5) << 5; block_cols = ((in_col + 31) >> 5) << 5;
} }
int block_rows = kThreadsPerBlock / block_cols; int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1); dim3 block_size = dim3(block_cols, block_rows, 1);
...@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols = int grid_cols =
std::min((input_col + block_cols - 1) / block_cols, max_blocks); std::min((in_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows = int grid_rows =
std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, output_col_0, outs_gpu); input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
} else { } else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, outs_col_gpu, input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), outs_gpu); static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
} }
} }
}; };
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
namespace paddle { namespace paddle {
......
...@@ -19,13 +19,6 @@ limitations under the License. */ ...@@ -19,13 +19,6 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#ifdef PADDLE_USE_ATLAS
extern "C" {
#include <cblas.h>
#include <clapack.h>
}
#endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#include <lapacke.h> #include <lapacke.h>
......
...@@ -19,8 +19,17 @@ namespace paddle { ...@@ -19,8 +19,17 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T> template <typename T>
class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> { class MaxSeqPoolFunctor {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output, const framework::LoDTensor& input, framework::Tensor* output,
...@@ -60,7 +69,7 @@ class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> { ...@@ -60,7 +69,7 @@ class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> {
}; };
template <typename T> template <typename T>
class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> { class MaxSeqPoolGradFunctor {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& out_grad, const framework::Tensor& out_grad,
...@@ -93,10 +102,101 @@ class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> { ...@@ -93,10 +102,101 @@ class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> {
} }
}; };
template class MaxSeqPoolFunctor<platform::CPUDeviceContext, float>; template <typename T>
template class MaxSeqPoolFunctor<platform::CPUDeviceContext, double>; class SequencePoolFunctor<platform::CPUDeviceContext, T> {
template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, float>; public:
template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, double>; /* max pool has index output */
void operator()(const platform::CPUDeviceContext& context,
const std::string pooltype, const framework::LoDTensor& input,
framework::Tensor* output,
framework::Tensor* index = nullptr) {
if (pooltype == "MAX") {
math::MaxSeqPoolFunctor<T> max_pool;
max_pool(context, input, output, index);
return;
}
auto lod = input.lod()[0];
auto& place = *context.eigen_device();
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
Tensor in_t =
input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
Tensor out_t = output->Slice(i, i + 1);
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
int64_t w = input.numel() / input.dims()[0];
auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
auto out_e = EigenVector<T>::Flatten(out_t);
if (pooltype == "AVERAGE") {
out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "SUM") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h));
} else if (pooltype == "LAST") {
out_e.device(place) = in_e.chip(h - 1, 0);
} else if (pooltype == "FIRST") {
out_e.device(place) = in_e.chip(0, 0);
} else {
PADDLE_THROW("unsupported pooling pooltype");
}
}
}
};
template <typename T>
class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const std::string pooltype, const framework::Tensor& out_grad,
framework::LoDTensor* in_grad,
/* max pool has index */
const framework::Tensor* index = nullptr) {
if (pooltype == "MAX") {
math::MaxSeqPoolGradFunctor<T> max_pool_grad;
max_pool_grad(context, out_grad, *index, in_grad);
return;
}
if (pooltype == "LAST" || pooltype == "FIRST") {
// set X@Grad be zero at first when pooltype is LAST/FIRST
math::SetConstant<platform::CPUDeviceContext, T> functor;
functor(context, in_grad, 0);
}
auto lod = in_grad->lod()[0];
auto& place = *context.eigen_device();
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
static_cast<int>(lod[i + 1]));
auto out_g_t = out_grad.Slice(i, i + 1);
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
int64_t w = in_grad->numel() / in_grad->dims()[0];
auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
Eigen::DSizes<int, 2> bcast(h, 1);
if (pooltype == "AVERAGE") {
in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
} else if (pooltype == "SUM") {
in_g_e.device(place) = (out_g_e).broadcast(bcast);
} else if (pooltype == "SQRT") {
in_g_e.device(place) =
(out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
} else if (pooltype == "LAST") {
in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
} else if (pooltype == "FIRST") {
in_g_e.chip(0, 0).device(place) = out_g_e_v;
} else {
PADDLE_THROW("unsupported pooling pooltype");
}
}
}
};
template class SequencePoolFunctor<platform::CPUDeviceContext, float>;
template class SequencePoolFunctor<platform::CPUDeviceContext, double>;
template class SequencePoolGradFunctor<platform::CPUDeviceContext, float>;
template class SequencePoolGradFunctor<platform::CPUDeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/operators/math/sequence_pooling.h"
#include "paddle/fluid/platform/cuda_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -22,113 +23,331 @@ namespace math { ...@@ -22,113 +23,331 @@ namespace math {
#define FLT_MAX __FLT_MAX__ #define FLT_MAX __FLT_MAX__
template <typename T> template <typename T>
__global__ void KeMaxSequencePool(const T* input, const size_t* starts, struct MaxPoolFunctor {
T* output, int* index, int64_t num_seq, HOSTDEVICE void operator()(const T* input, const size_t start,
int64_t dim) { const size_t end, const size_t item_dim, T* output,
int dim_idx = threadIdx.x; int* index) {
int seq_id = blockIdx.x; for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
if (seq_id >= num_seq) return;
size_t start = starts[seq_id];
size_t end = starts[seq_id + 1];
for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
T max_val = static_cast<T>(-FLT_MAX); T max_val = static_cast<T>(-FLT_MAX);
int max_id = -1; int max_index = -1;
for (size_t step_id = start; step_id < end; step_id++) { for (int i = start; i < end; ++i) {
if (max_val < input[step_id * dim + i]) { if (max_val < input[item_dim * i + tid]) {
max_val = input[step_id * dim + i]; max_val = input[item_dim * i + tid];
max_id = step_id; max_index = i;
} }
} }
output[seq_id * dim + i] = max_val; output[tid] = max_val;
index[seq_id * dim + i] = max_id; index[tid] = max_index;
} }
}
};
template <typename T>
struct AvgPoolFunctor {
HOSTDEVICE void operator()(const T* input, const size_t start,
const size_t end, const size_t item_dim, T* output,
int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
T val = static_cast<T>(0);
for (int i = start; i < end; ++i) {
val += input[item_dim * i + tid];
}
// end, start is lod, so end - start != 0
output[tid] = val / static_cast<T>(end - start);
}
}
};
template <typename T>
struct SumPoolFunctor {
HOSTDEVICE void operator()(const T* input, const size_t start,
const size_t end, const size_t item_dim, T* output,
int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
T val = static_cast<T>(0);
for (int i = start; i < end; ++i) {
val += input[item_dim * i + tid];
}
output[tid] = val;
}
}
};
template <typename T>
struct SqrtPoolFunctor {
HOSTDEVICE void operator()(const T* input, const size_t start,
const size_t end, const size_t item_dim, T* output,
int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
T val = static_cast<T>(0);
for (int i = start; i < end; ++i) {
val += input[item_dim * i + tid];
}
// end, start is lod, so end - start != 0
output[tid] = val / sqrt(end - start);
}
}
};
template <typename T>
struct LastPoolFunctor {
HOSTDEVICE void operator()(const T* input, const size_t start,
const size_t end, const size_t item_dim, T* output,
int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
output[tid] = input[item_dim * (end - 1) + tid];
}
}
};
template <typename T>
struct FirstPoolFunctor {
HOSTDEVICE void operator()(const T* input, const size_t start,
const size_t end, const size_t item_dim, T* output,
int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
output[tid] = input[item_dim * start + tid];
}
}
};
template <typename T, typename Range_OP>
__global__ void sequence_pool_kernel(Range_OP op, const T* input,
const size_t* lod, const size_t lod_size,
const size_t item_dim, T* output,
int* index) {
int bid = blockIdx.x;
if (bid >= lod_size - 1) return;
size_t start = lod[bid];
size_t end = lod[bid + 1];
int* index_offset = nullptr;
if (index != nullptr) {
index_offset = &index[bid * item_dim];
}
op(input, start, end, item_dim, &output[bid * item_dim], index_offset);
} }
template <typename T> template <typename T>
class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> { class SequencePoolFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output, const std::string pooltype, const framework::LoDTensor& input,
framework::Tensor* index) { framework::Tensor* output,
auto in_dims = input.dims(); framework::Tensor* index = nullptr) {
auto out_dims = output->dims(); auto lod = input.lod()[0];
auto idx_dims = index->dims(); const size_t item_dim = output->numel() / output->dims()[0];
PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1)); dim3 threads(1024, 1);
PADDLE_ENFORCE_GT(out_dims.size(), 1); dim3 grid(lod.size(), 1);
for (int64_t i = 1; i < in_dims.size(); ++i) { if (pooltype == "MAX") {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); sequence_pool_kernel<
} T, MaxPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
PADDLE_ENFORCE_EQ(idx_dims, out_dims); MaxPoolFunctor<T>(), input.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
auto starts = input.lod()[0]; output->mutable_data<T>(context.GetPlace()), index->data<int>());
const T* in_data = input.data<T>(); } else if (pooltype == "AVERAGE") {
T* out_data = output->data<T>(); sequence_pool_kernel<
int* max_index = index->data<int>(); T, AvgPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
AvgPoolFunctor<T>(), input.data<T>(),
int64_t num_seq = out_dims[0]; lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
int64_t dim = output->numel() / num_seq; output->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "SUM") {
dim3 threads(256, 1); sequence_pool_kernel<
dim3 grid(num_seq, 1); T, SumPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
auto stream = context.stream(); SumPoolFunctor<T>(), input.data<T>(),
KeMaxSequencePool<T><<<grid, threads, 0, stream>>>( lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_data, starts.CUDAData(context.GetPlace()), out_data, max_index, output->mutable_data<T>(context.GetPlace()), nullptr);
num_seq, dim); } else if (pooltype == "SQRT") {
sequence_pool_kernel<
T, SqrtPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
SqrtPoolFunctor<T>(), input.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "LAST") {
sequence_pool_kernel<
T, LastPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
LastPoolFunctor<T>(), input.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "FIRST") {
sequence_pool_kernel<
T, FirstPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
FirstPoolFunctor<T>(), input.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), nullptr);
} else {
PADDLE_THROW("unsupported pooling pooltype");
}
} }
}; };
template <typename T> template <typename T>
__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, struct MaxPoolGradFunctor {
T* in_grad, int64_t num_seq, HOSTDEVICE void operator()(const T* out_grad, const size_t start,
int64_t dim) { const size_t end, const size_t item_dim,
int idx = threadIdx.x + blockIdx.x * blockDim.x; T* in_grad, const int* index) {
int col_idx = idx % dim; for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
if (idx < num_seq * dim) { for (int i = start; i < end; ++i) {
int step_id = max_index[idx]; if (i == index[tid]) {
in_grad[step_id * dim + col_idx] = out_grad[idx]; in_grad[item_dim * i + tid] = out_grad[tid];
} else {
in_grad[item_dim * i + tid] = static_cast<T>(0);
} }
}
}
}
};
template <typename T>
struct AvgPoolGradFunctor {
HOSTDEVICE void operator()(const T* out_grad, const size_t start,
const size_t end, const size_t item_dim,
T* in_grad, const int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
for (int i = start; i < end; ++i) {
in_grad[item_dim * i + tid] = out_grad[tid] / (end - start);
}
}
}
};
template <typename T>
struct SumPoolGradFunctor {
HOSTDEVICE void operator()(const T* out_grad, const size_t start,
const size_t end, const size_t item_dim,
T* in_grad, const int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
for (int i = start; i < end; ++i) {
in_grad[item_dim * i + tid] = out_grad[tid];
}
}
}
};
template <typename T>
struct SqrtPoolGradFunctor {
HOSTDEVICE void operator()(const T* out_grad, const size_t start,
const size_t end, const size_t item_dim,
T* in_grad, const int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
for (int i = start; i < end; ++i) {
in_grad[item_dim * i + tid] =
out_grad[tid] / (sqrt(static_cast<T>(end - start)));
}
}
}
};
template <typename T>
struct LastPoolGradFunctor {
HOSTDEVICE void operator()(const T* out_grad, const size_t start,
const size_t end, const size_t item_dim,
T* in_grad, const int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
for (int i = start; i < end; ++i) {
if (i == end - 1) {
in_grad[item_dim * i + tid] = out_grad[tid];
} else {
in_grad[item_dim * i + tid] = static_cast<T>(0);
}
}
}
}
};
template <typename T>
struct FirstPoolGradFunctor {
HOSTDEVICE void operator()(const T* out_grad, const size_t start,
const size_t end, const size_t item_dim,
T* in_grad, const int* index) {
for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
for (int i = start; i < end; ++i) {
if (i == start) {
in_grad[item_dim * i + tid] = out_grad[tid];
} else {
in_grad[item_dim * i + tid] = static_cast<T>(0);
}
}
}
}
};
template <typename T, typename Range_OP>
__global__ void sequence_pool_grad_kernel(Range_OP op, const T* out_grad,
const size_t* lod,
const size_t lod_size,
const size_t item_dim, T* in_grad,
const int* index) {
int bid = blockIdx.x;
if (bid >= lod_size - 1) return;
size_t start = lod[bid];
size_t end = lod[bid + 1];
const int* index_offset = nullptr;
if (index != nullptr) {
index_offset = &index[bid * item_dim];
}
op(&out_grad[bid * item_dim], start, end, item_dim, in_grad, index_offset);
} }
template <typename T> template <typename T>
class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, T> { class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& out_grad, const std::string pooltype, const framework::Tensor& out_grad,
const framework::Tensor& index, framework::LoDTensor* in_grad,
framework::LoDTensor* in_grad) { /* max pool has index */
auto og_dims = out_grad.dims(); const framework::Tensor* index = nullptr) {
auto idx_dims = index.dims(); auto lod = in_grad->lod()[0];
auto ig_dims = in_grad->dims(); const size_t item_dim = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1)); dim3 threads(1024, 1);
PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1)); dim3 grid(lod.size(), 1);
for (int64_t i = 1; i < og_dims.size(); ++i) { if (pooltype == "MAX") {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); sequence_pool_grad_kernel<
} T, MaxPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
PADDLE_ENFORCE_EQ(idx_dims, og_dims); MaxPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
const T* og_data = out_grad.data<T>(); in_grad->mutable_data<T>(context.GetPlace()), index->data<int>());
const int* max_index = index.data<int>(); } else if (pooltype == "AVERAGE") {
T* ig_data = in_grad->data<T>(); sequence_pool_grad_kernel<
T, AvgPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
SetConstant<platform::CUDADeviceContext, T> set_zero; AvgPoolGradFunctor<T>(), out_grad.data<T>(),
set_zero(context, in_grad, static_cast<T>(0.0)); lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
int64_t num_seq = og_dims[0]; in_grad->mutable_data<T>(context.GetPlace()), nullptr);
int64_t dim = out_grad.numel() / num_seq; } else if (pooltype == "SUM") {
sequence_pool_grad_kernel<
unsigned int blocks = (num_seq * dim + 128 - 1) / 128; T, SumPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
dim3 threads(128, 1); SumPoolGradFunctor<T>(), out_grad.data<T>(),
dim3 grid(blocks, 1); lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
auto stream = context.stream(); in_grad->mutable_data<T>(context.GetPlace()), nullptr);
KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>( } else if (pooltype == "SQRT") {
og_data, max_index, ig_data, num_seq, dim); sequence_pool_grad_kernel<
T, SqrtPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
SqrtPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "LAST") {
sequence_pool_grad_kernel<
T, LastPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
LastPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "FIRST") {
sequence_pool_grad_kernel<
T, FirstPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
FirstPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), nullptr);
} else {
PADDLE_THROW("unsupported pooling pooltype");
}
} }
}; };
template class MaxSeqPoolFunctor<platform::CUDADeviceContext, float>; // sequence pooling
template class MaxSeqPoolFunctor<platform::CUDADeviceContext, double>; template class SequencePoolFunctor<platform::CUDADeviceContext, float>;
template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, float>; template class SequencePoolFunctor<platform::CUDADeviceContext, double>;
template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, double>; template class SequencePoolGradFunctor<platform::CUDADeviceContext, float>;
template class SequencePoolGradFunctor<platform::CUDADeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -21,23 +21,23 @@ namespace paddle { ...@@ -21,23 +21,23 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
#define FLT_MAX __FLT_MAX__
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class MaxSeqPoolFunctor { class SequencePoolFunctor {
public: public:
void operator()(const DeviceContext& context, /* max pool has index output */
void operator()(const DeviceContext& context, const std::string pooltype,
const framework::LoDTensor& input, framework::Tensor* output, const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index); framework::Tensor* index = nullptr);
}; };
template <typename DeviceContext, class T> template <typename DeviceContext, typename T>
class MaxSeqPoolGradFunctor { class SequencePoolGradFunctor {
public: public:
void operator()(const DeviceContext& context, void operator()(const DeviceContext& context, const std::string pooltype,
const framework::Tensor& out_grad, const framework::Tensor& out_grad,
const framework::Tensor& index, framework::LoDTensor* in_grad,
framework::LoDTensor* in_grad); /* max pool has index */
const framework::Tensor* index = nullptr);
}; };
} // namespace math } // namespace math
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename Functor>
class MKLDNNActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(context.Input<framework::Tensor>("X") != nullptr,
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
PADDLE_ENFORCE(context.Output<framework::Tensor>("Out") != nullptr,
"Cannot find output tensor Out, variable name = %s",
context.op().Output("Out"));
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class MKLDNNActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
namespace {
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper) {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
}
#endif
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace(), layout, library);
}
} // anonymous namespace
class ActivationWithMKLDNNOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
} // namespace operators
} // namespace paddle
...@@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase { ...@@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase {
PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(), PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
"Only support parameter type as LoDTensor"); "Only support parameter type as LoDTensor");
auto &src = scope.FindVar(param)->Get<LoDTensor>(); auto &src = scope.FindVar(param)->Get<LoDTensor>();
for (size_t i = 0; i < sub_scopes.size(); ++i) {
auto *sub_scope0 = sub_scopes[0];
auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
dst0->ShareDataWith(src);
for (size_t i = 1; i < sub_scopes.size(); ++i) {
auto &place = places[i]; auto &place = places[i];
auto *sub_scope = sub_scopes[i]; auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>(); auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future>
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/send_recv_util.h"
namespace paddle {
namespace operators {
class PrefetchOp : public framework::OperatorBase {
public:
PrefetchOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto ins = Inputs("X");
auto outs = Outputs("Out");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get "
<< outs[i] << "back";
rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
outs[i]);
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
PADDLE_ENFORCE(rpc_client->Wait());
}
};
class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
public:
PrefetchOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(LoDTensor) Input Id variables to be sent").AsDuplicable();
AddOutput("RPCClient",
"(RPCClient) The RPC client object which will be"
"initialized at most once.");
AddOutput("Out",
"(SelectedRows) result "
"to be fetched from parameter server")
.AsDuplicable();
AddAttr<std::vector<std::string>>(
"epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping")
.SetDefault({"127.0.0.1:6164"});
AddComment(R"DOC(
Prefetch operator
This operator will send Ids variables to listen_and_serve op at
the parameter server and fetch result back.
)DOC");
}
};
class PrefetchOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class PrefetchOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(prefetch, ops::PrefetchOp,
paddle::framework::EmptyGradOpMaker, ops::PrefetchOpMaker,
ops::PrefetchOpVarTypeInference,
ops::PrefetchOpShapeInference);
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -59,7 +60,9 @@ class ReadOp : public framework::OperatorBase { ...@@ -59,7 +60,9 @@ class ReadOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
framework::ReaderHolder* reader = framework::ReaderHolder* reader =
scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>(); detail::Ref(scope.FindVar(Input("Reader")),
"Cannot find reader variable %s", Input("Reader"))
.GetMutable<framework::ReaderHolder>();
std::vector<std::string> out_arg_names = Outputs("Out"); std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
......
...@@ -166,7 +166,9 @@ void DoubleBufferReader::PrefetchThreadFunc() { ...@@ -166,7 +166,9 @@ void DoubleBufferReader::PrefetchThreadFunc() {
std::swap(gpu_batch, batch.payloads_); std::swap(gpu_batch, batch.payloads_);
} }
if (!buffer_->Send(&batch)) { try {
buffer_->Send(&batch);
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The " VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread will terminate."; "prefetch thread will terminate.";
break; break;
......
...@@ -81,10 +81,10 @@ class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase { ...@@ -81,10 +81,10 @@ class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase {
This operator creates a multi-pass reader. A multi-pass reader This operator creates a multi-pass reader. A multi-pass reader
is used to yield data for several pass training continuously. is used to yield data for several pass training continuously.
It takes the the number of pass to run as one of its attributes It takes the number of passes to run as one of its attributes
('pass_num'), and maintains a pass counter to record how many ('pass_num'), and maintains a pass counter to record how many
passes it has completed. When the underlying reader reach the EOF, passes it has completed. When the underlying reader reaches the
the multi-pass reader checks whether it has completed training EOF, the multi-pass reader checks whether it has completed training
of the given number of pass. If not, the underlying reader will of the given number of pass. If not, the underlying reader will
be re-initialized and starts a new pass automatically. be re-initialized and starts a new pass automatically.
)DOC"); )DOC");
......
...@@ -12,12 +12,15 @@ ...@@ -12,12 +12,15 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <mutex>
#include <thread>
#include "paddle/fluid/operators/reader/reader_op_registry.h" #include "paddle/fluid/operators/reader/reader_op_registry.h"
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace reader { namespace reader {
template <bool ThreadSafe>
class RecordIOFileReader : public framework::FileReader { class RecordIOFileReader : public framework::FileReader {
public: public:
explicit RecordIOFileReader(const std::string& filename, explicit RecordIOFileReader(const std::string& filename,
...@@ -25,7 +28,12 @@ class RecordIOFileReader : public framework::FileReader { ...@@ -25,7 +28,12 @@ class RecordIOFileReader : public framework::FileReader {
: FileReader(dims), : FileReader(dims),
scanner_(filename), scanner_(filename),
dev_ctx_(*platform::DeviceContextPool::Instance().Get( dev_ctx_(*platform::DeviceContextPool::Instance().Get(
platform::CPUPlace())) {} platform::CPUPlace())) {
if (ThreadSafe) {
mutex_.reset(new std::mutex());
}
LOG(INFO) << "Creating file reader" << filename;
}
bool HasNext() const override { return scanner_.HasNext(); } bool HasNext() const override { return scanner_.HasNext(); }
...@@ -33,10 +41,16 @@ class RecordIOFileReader : public framework::FileReader { ...@@ -33,10 +41,16 @@ class RecordIOFileReader : public framework::FileReader {
protected: protected:
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override { void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
if (ThreadSafe) {
std::lock_guard<std::mutex> guard(*mutex_);
*out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
} else {
*out = framework::ReadFromRecordIO(scanner_, dev_ctx_); *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
} }
}
private: private:
std::unique_ptr<std::mutex> mutex_;
recordio::Scanner scanner_; recordio::Scanner scanner_;
const platform::DeviceContext& dev_ctx_; const platform::DeviceContext& dev_ctx_;
}; };
...@@ -59,8 +73,9 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { ...@@ -59,8 +73,9 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
out->Reset(
new RecordIOFileReader(filename, RestoreShapes(shape_concat, ranks))); out->Reset(new RecordIOFileReader<true>(
filename, RestoreShapes(shape_concat, ranks)));
} }
}; };
...@@ -87,4 +102,4 @@ REGISTER_FILE_READER_OPERATOR(create_recordio_file_reader, ...@@ -87,4 +102,4 @@ REGISTER_FILE_READER_OPERATOR(create_recordio_file_reader,
reader::CreateRecordIOReaderOp, reader::CreateRecordIOReaderOp,
reader::CreateRecordIOReaderOpMaker); reader::CreateRecordIOReaderOpMaker);
REGISTER_FILE_READER(recordio, reader::RecordIOFileReader); REGISTER_FILE_READER(recordio, reader::RecordIOFileReader<false>);
...@@ -146,14 +146,19 @@ void MultipleReader::PrefetchThreadFunc(std::string file_name, ...@@ -146,14 +146,19 @@ void MultipleReader::PrefetchThreadFunc(std::string file_name,
while (reader->HasNext()) { while (reader->HasNext()) {
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (!buffer_->Send(&ins)) { try {
buffer_->Send(&ins);
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch " VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
"thread of file '" "thread of file '"
<< file_name << "' will terminate."; << file_name << "' will terminate.";
break; break;
} }
} }
if (!available_thread_idx_->Send(&thread_idx)) {
try {
available_thread_idx_->Send(&thread_idx);
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. " VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
"Fail to send thread_idx."; "Fail to send thread_idx.";
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
class SendBarrierOp : public framework::OperatorBase {
public:
SendBarrierOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
// need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait());
}
};
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendBarrierOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("RPCClient",
"(RPCClient) The RPC client object which is"
"initialized at most once.");
AddComment(R"DOC(
SendBarrier operator
This operator will send a send barrier signal to list_and_serv op, so that
the Parameter Server would knew all variables have been sent.
)DOC");
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendBarrierOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendBarrierOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
ops::SendBarrierOpVarTypeInference,
ops::SendBarrierOpShapeInference);
...@@ -12,35 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,35 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <future>
#include <ostream> #include <ostream>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static bool NeedSend(const framework::Scope& scope,
const std::string& varname) {
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
varname);
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else {
PADDLE_THROW(
"Variable type in send side should be in "
"[LodTensor, SelectedRows]");
}
return false;
}
class SendOp : public framework::OperatorBase { class SendOp : public framework::OperatorBase {
public: public:
...@@ -72,7 +56,7 @@ class SendOp : public framework::OperatorBase { ...@@ -72,7 +56,7 @@ class SendOp : public framework::OperatorBase {
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
VLOG(2) << "sending " << ins[i] << " to " << epmap[i]; VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else { } else {
VLOG(3) << "don't send no-initialied variable: " << ins[i]; VLOG(3) << "don't send no-initialied variable: " << ins[i];
...@@ -81,7 +65,7 @@ class SendOp : public framework::OperatorBase { ...@@ -81,7 +65,7 @@ class SendOp : public framework::OperatorBase {
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : endpoints) { for (auto& ep : endpoints) {
VLOG(2) << "batch barrier, ep: " << ep; VLOG(3) << "batch barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep); rpc_client->AsyncSendBatchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
......
...@@ -122,7 +122,8 @@ void StartServerNet(bool is_sparse) { ...@@ -122,7 +122,8 @@ void StartServerNet(bool is_sparse) {
// sub program run in listen_and_serv_op, for simple test we use sum // sub program run in listen_and_serv_op, for simple test we use sum
f::ProgramDesc program; f::ProgramDesc program;
f::BlockDesc *optimize_block = program.MutableBlock(0); const auto &root_block = program.Block(0);
auto *optimize_block = program.AppendBlock(root_block);
// X for server side tensors, RX for received tensers, must be of same shape. // X for server side tensors, RX for received tensers, must be of same shape.
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
namespace paddle {
namespace operators {
inline bool NeedSend(const framework::Scope& scope,
const std::string& varname) {
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
varname);
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else {
PADDLE_THROW(
"Variable type in send side should be in "
"[LodTensor, SelectedRows]");
}
return false;
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future>
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/send_recv_util.h"
namespace paddle {
namespace operators {
class SendVarsOp : public framework::OperatorBase {
public:
SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto ins = Inputs("X");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_sent");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
// TODO(Yancey1989): we need to use an IO threadpool which has
// a larger number of threads than the computing threadpool.
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
if (sync_send) {
rpc_client->Wait();
}
}
};
class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendVarsOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddOutput("RPCClient",
"(RPCClient) The RPC client object which will be"
"initialized at most once.");
AddComment(R"DOC(
Send operator
This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
AddAttr<int>("sync_send",
"(int, default 0)"
"sync send or async send.")
.SetDefault(0);
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendVarsOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendVarsOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
ops::SendVarsOpVarTypeInference,
ops::SendVarsOpShapeInference);
...@@ -23,12 +23,6 @@ namespace operators { ...@@ -23,12 +23,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SequencePoolKernel : public framework::OpKernel<T> { class SequencePoolKernel : public framework::OpKernel<T> {
...@@ -37,11 +31,13 @@ class SequencePoolKernel : public framework::OpKernel<T> { ...@@ -37,11 +31,13 @@ class SequencePoolKernel : public framework::OpKernel<T> {
auto* in = context.Input<LoDTensor>("X"); auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<Tensor>("Out"); auto* out = context.Output<Tensor>("Out");
std::string pooltype = context.Attr<std::string>("pooltype"); std::string pooltype = context.Attr<std::string>("pooltype");
Tensor* index = nullptr;
if (pooltype == "MAX") {
index = context.Output<Tensor>("MaxIndex");
}
auto dims = in->dims(); auto dims = in->dims();
auto lod = in->lod(); auto lod = in->lod();
int64_t w = in->numel() / dims[0];
// InferShape by lod // InferShape by lod
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
...@@ -50,45 +46,14 @@ class SequencePoolKernel : public framework::OpKernel<T> { ...@@ -50,45 +46,14 @@ class SequencePoolKernel : public framework::OpKernel<T> {
"The first dimension of Input(X) must be large than batch size."); "The first dimension of Input(X) must be large than batch size.");
dims[0] = lod[0].size() - 1; dims[0] = lod[0].size() - 1;
out->Resize({dims}); out->Resize({dims});
auto lod_level_0 = lod[0];
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
if (pooltype == "MAX") { if (pooltype == "MAX") {
math::MaxSeqPoolFunctor<DeviceContext, T> max_pool;
auto* index = context.Output<Tensor>("MaxIndex");
index->Resize({dims}); index->Resize({dims});
index->mutable_data<int>(context.GetPlace()); index->mutable_data<int>(context.GetPlace());
max_pool(dev_ctx, *in, out, index);
return;
}
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
static_cast<int>(lod_level_0[i + 1]));
Tensor out_t = out->Slice(i, i + 1);
int64_t h = static_cast<int64_t>(lod_level_0[i + 1] - lod_level_0[i]);
auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
auto out_e = EigenVector<T>::Flatten(out_t);
if (pooltype == "AVERAGE") {
out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "SUM") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h));
} else if (pooltype == "LAST") {
out_e.device(place) = in_e.chip(h - 1, 0);
} else if (pooltype == "FIRST") {
out_e.device(place) = in_e.chip(0, 0);
} else {
PADDLE_THROW("unsupported pooling pooltype");
}
} }
math::SequencePoolFunctor<DeviceContext, T> pool;
pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
index);
} }
}; };
...@@ -96,58 +61,17 @@ template <typename DeviceContext, typename T> ...@@ -96,58 +61,17 @@ template <typename DeviceContext, typename T>
class SequencePoolGradKernel : public framework::OpKernel<T> { class SequencePoolGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out_g = context.Input<Tensor>(framework::GradVarName("Out")); auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X")); auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
std::string pooltype = context.Attr<std::string>("pooltype"); std::string pooltype = context.Attr<std::string>("pooltype");
const Tensor* index = nullptr;
auto dims = in->dims();
auto lod = in->lod()[0];
int64_t w = in->numel() / dims[0];
in_g->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
if (pooltype == "MAX") { if (pooltype == "MAX") {
math::MaxSeqPoolGradFunctor<DeviceContext, T> max_pool_grad; index = context.Input<Tensor>("MaxIndex");
auto* index = context.Input<Tensor>("MaxIndex");
max_pool_grad(dev_ctx, *out_g, *index, in_g);
return;
}
if (pooltype == "LAST" || pooltype == "FIRST") {
// set X@Grad be zero at first when pooltype is LAST/FIRST
math::SetConstant<DeviceContext, T> functor;
functor(dev_ctx, in_g, 0);
}
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
auto in_g_t =
in_g->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
auto out_g_t = out_g->Slice(i, i + 1);
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
Eigen::DSizes<int, 2> bcast(h, 1);
if (pooltype == "AVERAGE") {
in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
} else if (pooltype == "SUM") {
in_g_e.device(place) = (out_g_e).broadcast(bcast);
} else if (pooltype == "SQRT") {
in_g_e.device(place) =
(out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
} else if (pooltype == "LAST") {
in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
} else if (pooltype == "FIRST") {
in_g_e.chip(0, 0).device(place) = out_g_e_v;
} else {
PADDLE_THROW("unsupported pooling pooltype");
}
} }
in_g->mutable_data<T>(context.GetPlace());
math::SequencePoolGradFunctor<DeviceContext, T> pool;
pool(context.template device_context<DeviceContext>(), pooltype, *out_g,
in_g, index);
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_ids_op.h"
namespace paddle {
namespace operators {
class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SplitIdsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
.AsDuplicable();
AddComment(R"DOC(
Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number
Example:
Input:
X = [1,2,3,4,5,6]
Out(3 output):
out0 = [3, 6]
out1 = [1, 4]
out2 = [2, 5]
)DOC");
}
};
class SplitIdsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must has input Ids.");
PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
auto ids_var_type = ctx->GetInputsVarType("Ids").front();
PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
}
};
class SplitIdsOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output("Out")) {
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
ops::SplitIdsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SplitIdsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
if (!platform::is_cpu_place(place)) {
PADDLE_THROW("SplitIds do not support GPU kernel");
}
const auto* ids_t = ctx.Input<framework::LoDTensor>("Ids");
auto& ids_dims = ids_t->dims();
auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
const T* ids = ids_t->data<T>();
const size_t shard_num = outs.size();
std::vector<std::vector<T>> out_ids;
out_ids.resize(outs.size());
// split id by their shard_num.
for (size_t i = 0; i < ids_dims[0]; ++i) {
T id = ids[i];
size_t shard_id = static_cast<size_t>(id) % shard_num;
out_ids[shard_id].push_back(id);
}
// create tensor for each shard and send to parameter server
for (size_t i = 0; i < out_ids.size(); ++i) {
auto* shard_t = outs[i];
std::vector<T> ids = out_ids[i];
auto* shard_data = shard_t->mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
for (size_t i = 0; i < ids.size(); ++i) {
shard_data[i] = ids[i];
}
}
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <thread>
#include <typeindex>
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
inline ncclDataType_t ToNCCLDataType(std::type_index type) {
if (type == typeid(float)) { // NOLINT
return ncclFloat;
} else if (type == typeid(double)) { // NOLINT
return ncclDouble;
} else if (type == typeid(int)) { // NOLINT
return ncclInt;
} else {
PADDLE_THROW("Not supported");
}
}
class NCCLGroupGuard {
public:
inline NCCLGroupGuard() {
mutex().lock();
PADDLE_ENFORCE(dynload::ncclGroupStart());
}
inline ~NCCLGroupGuard() {
PADDLE_ENFORCE(dynload::ncclGroupEnd());
mutex().unlock();
}
private:
static std::mutex &mutex() {
static std::mutex mtx;
return mtx;
}
};
struct NCCLContext {
std::unique_ptr<CUDADeviceContext> ctx_;
ncclComm_t comm_;
explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {}
cudaStream_t stream() const { return ctx_->stream(); }
int device_id() const {
return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
}
static void InitNCCLContext(std::unordered_map<int, NCCLContext> &contexts,
const std::vector<platform::Place> &places) {
std::vector<ncclComm_t> comms;
std::vector<int> devs;
comms.resize(contexts.size());
devs.reserve(contexts.size());
for (auto &p : places) {
devs.push_back(boost::get<platform::CUDAPlace>(p).device);
}
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(contexts.size()), &devs[0]));
int i = 0;
for (auto &dev_id : devs) {
contexts.at(dev_id).comm_ = comms[i++];
}
}
};
struct NCCLContextMap {
std::unordered_map<int, NCCLContext> contexts_;
std::vector<int> order_;
NCCLContextMap(const std::vector<platform::Place> &places) {
order_.reserve(places.size());
for (auto &p : places) {
int dev_id = boost::get<CUDAPlace>(p).device;
order_.emplace_back(dev_id);
contexts_.emplace(dev_id, NCCLContext(dev_id));
}
PADDLE_ENFORCE_EQ(
order_.size(), contexts_.size(),
"NCCL Context Map does not support contain two or more same device");
std::vector<ncclComm_t> comms;
comms.resize(order_.size());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(order_.size()), &order_[0]));
int i = 0;
for (auto &dev_id : order_) {
contexts_.at(dev_id).comm_ = comms[i++];
}
}
CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
CUDADeviceContext *DevCtx(platform::Place p) const {
return DevCtx(boost::get<CUDAPlace>(p).device);
}
const NCCLContext &at(platform::Place p) const {
return this->at(boost::get<CUDAPlace>(p).device);
}
const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }
void WaitAll() {
for (auto &p : contexts_) {
p.second.ctx_->Wait();
}
}
};
} // namespace platform
} // namespace paddle
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#include "cuda_runtime.h"
#endif
#include "gtest/gtest.h" #include "gtest/gtest.h"
TEST(Event, CpuElapsedTime) { TEST(Event, CpuElapsedTime) {
...@@ -157,3 +160,13 @@ TEST(RecordEvent, RecordEvent) { ...@@ -157,3 +160,13 @@ TEST(RecordEvent, RecordEvent) {
// Will remove parsing-related code from test later // Will remove parsing-related code from test later
DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler"); DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
} }
#ifdef PADDLE_WITH_CUDA
TEST(TMP, stream_wait) {
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaStreamSynchronize(stream);
cudaStreamSynchronize(stream);
cudaStreamSynchronize(stream);
}
#endif
...@@ -3,11 +3,13 @@ if(WITH_PYTHON) ...@@ -3,11 +3,13 @@ if(WITH_PYTHON)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
parallel_executor
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
else() else()
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
parallel_executor
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID) if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt) target_link_libraries(paddle_pybind rt)
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
...@@ -496,6 +497,20 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -496,6 +497,20 @@ All parameter, weight, gradient are variables in Paddle.
m.def("disable_profiler", platform::DisableProfiler); m.def("disable_profiler", platform::DisableProfiler);
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
py::class_<ParallelExecutor>(m, "ParallelExecutor")
.def("__init__",
[](ParallelExecutor &self, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program,
const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope) {
new (&self) ParallelExecutor(num_threads, use_event, places,
params, startup_program, main_program,
loss_var_name, scope);
})
.def("run", &ParallelExecutor::Run);
BindRecordIOWriter(m); BindRecordIOWriter(m);
return m.ptr(); return m.ptr();
} }
......
...@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr; ...@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr;
} __name; // struct DynLoad__##__name } __name; // struct DynLoad__##__name
#endif #endif
#ifdef PADDLE_USE_ATLAS #define PADDLE_SGETRF LAPACKE_sgetrf
#define PADDLE_SGETRF clapack_sgetrf #define PADDLE_DGETRF LAPACKE_dgetrf
#define PADDLE_DGETRF clapack_dgetrf #define PADDLE_SGETRI LAPACKE_sgetri
#define PADDLE_SGETRI clapack_sgetri #define PADDLE_DGETRI LAPACKE_dgetri
#define PADDLE_DGETRI clapack_dgetri
#else
#define PADDLE_SGETRF LAPACKE_sgetrf
#define PADDLE_DGETRF LAPACKE_dgetrf
#define PADDLE_SGETRI LAPACKE_sgetri
#define PADDLE_DGETRI LAPACKE_dgetri
#endif
#define LAPACK_ROUTINE_EACH(__macro) \ #define LAPACK_ROUTINE_EACH(__macro) \
__macro(PADDLE_SGETRF) \ __macro(PADDLE_SGETRF) \
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB) #if defined(PADDLE_USE_VECLIB)
extern "C" { extern "C" {
#include <cblas.h> #include <cblas.h>
#include <clapack.h> #include <clapack.h>
......
...@@ -53,6 +53,7 @@ function cmake_gen() { ...@@ -53,6 +53,7 @@ function cmake_gen() {
-DWITH_FAST_BUNDLE_TEST=ON -DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
======================================== ========================================
EOF EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because # Disable UNITTEST_USE_VIRTUALENV in docker because
...@@ -78,6 +79,7 @@ EOF ...@@ -78,6 +79,7 @@ EOF
-DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \ -DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
} }
...@@ -123,9 +125,8 @@ EOF ...@@ -123,9 +125,8 @@ EOF
-DWITH_AVX=${WITH_AVX:-ON} \ -DWITH_AVX=${WITH_AVX:-ON} \
-DWITH_SWIG_PY=ON \ -DWITH_SWIG_PY=ON \
-DWITH_STYLE_CHECK=OFF -DWITH_STYLE_CHECK=OFF
make -j `nproc` gen_proto_py framework_py_proto
make -j `nproc` copy_paddle_pybind make -j `nproc` paddle_docs paddle_apis
make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
popd popd
fi fi
......
...@@ -153,9 +153,15 @@ if [ $? -ne 0 ]; then ...@@ -153,9 +153,15 @@ if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
INSTALLED_VERSION=`pip freeze 2>/dev/null | grep '^paddle' | sed 's/.*==//g'` if [ "@WITH_GPU@" == "ON" ]; then
PADDLE_NAME="paddlepaddle-gpu"
else
PADDLE_NAME="paddlepaddle"
fi
INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'`
if [ -z ${INSTALLED_VERSION} ]; then if [ -z "${INSTALLED_VERSION}" ]; then
INSTALLED_VERSION="0.0.0" # not installed INSTALLED_VERSION="0.0.0" # not installed
fi fi
cat <<EOF | python - cat <<EOF | python -
......
...@@ -7,9 +7,8 @@ cd $TRAVIS_BUILD_DIR/build ...@@ -7,9 +7,8 @@ cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only. # Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF
make -j `nproc` gen_proto_py framework_py_proto
make -j `nproc` copy_paddle_pybind make -j `nproc` paddle_docs paddle_apis
make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
# check websites for broken links # check websites for broken links
linkchecker doc/v2/en/html/index.html linkchecker doc/v2/en/html/index.html
......
...@@ -4,7 +4,7 @@ set(PY_FILES paddle/__init__.py ...@@ -4,7 +4,7 @@ set(PY_FILES paddle/__init__.py
${UTILS_PY_FILES} ${UTILS_PY_FILES}
${FLUID_PY_FILES}) ${FLUID_PY_FILES})
if(NOT WITH_FLUID) if(NOT WITH_FLUID_ONLY)
file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py) file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
...@@ -62,7 +62,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ...@@ -62,7 +62,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
if(NOT WITH_FLUID) if(NOT WITH_FLUID_ONLY)
set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model) set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
if(WITH_SWIG_PY) if(WITH_SWIG_PY)
list(APPEND paddle_python_deps python_api_wheel) list(APPEND paddle_python_deps python_api_wheel)
...@@ -73,12 +73,13 @@ add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) ...@@ -73,12 +73,13 @@ add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
if (WITH_TESTING) if (WITH_TESTING)
if(NOT WITH_FLUID) add_subdirectory(paddle/reader/tests)
add_subdirectory(paddle/dataset/tests)
if(NOT WITH_FLUID_ONLY)
add_subdirectory(paddle/trainer_config_helpers/tests) add_subdirectory(paddle/trainer_config_helpers/tests)
if (WITH_SWIG_PY) if (WITH_SWIG_PY)
# enable v2 API unittest only when paddle swig api is compiled # enable v2 API unittest only when paddle swig api is compiled
add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests) add_subdirectory(paddle/v2/plot/tests)
endif() endif()
endif() endif()
......
...@@ -14,8 +14,14 @@ ...@@ -14,8 +14,14 @@
try: try:
from version import full_version as __version__ from version import full_version as __version__
from version import commit as __git_commit__ from version import commit as __git_commit__
except ImportError: except ImportError:
import sys import sys
sys.stderr.write('''Warning with import paddle: you should not sys.stderr.write('''Warning with import paddle: you should not
import paddle from the source directory; please install paddlepaddle*.whl firstly.''' import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
) )
import reader
import dataset
import batch
batch = batch.batch
...@@ -28,6 +28,7 @@ import wmt16 ...@@ -28,6 +28,7 @@ import wmt16
import mq2007 import mq2007
import flowers import flowers
import voc2012 import voc2012
import image
__all__ = [ __all__ = [
'mnist', 'mnist',
...@@ -43,4 +44,5 @@ __all__ = [ ...@@ -43,4 +44,5 @@ __all__ = [
'mq2007', 'mq2007',
'flowers', 'flowers',
'voc2012', 'voc2012',
'image',
] ]
...@@ -31,7 +31,7 @@ images per class. ...@@ -31,7 +31,7 @@ images per class.
import cPickle import cPickle
import itertools import itertools
import numpy import numpy
import paddle.v2.dataset.common import paddle.dataset.common
import tarfile import tarfile
__all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] __all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
...@@ -75,7 +75,7 @@ def train100(): ...@@ -75,7 +75,7 @@ def train100():
:rtype: callable :rtype: callable
""" """
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
'train') 'train')
...@@ -90,7 +90,7 @@ def test100(): ...@@ -90,7 +90,7 @@ def test100():
:rtype: callable :rtype: callable
""" """
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
'test') 'test')
...@@ -105,7 +105,7 @@ def train10(): ...@@ -105,7 +105,7 @@ def train10():
:rtype: callable :rtype: callable
""" """
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
'data_batch') 'data_batch')
...@@ -120,20 +120,20 @@ def test10(): ...@@ -120,20 +120,20 @@ def test10():
:rtype: callable :rtype: callable
""" """
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
'test_batch') 'test_batch')
def fetch(): def fetch():
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
def convert(path): def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") paddle.dataset.common.convert(path, train100(), 1000, "cifar_train100")
paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") paddle.dataset.common.convert(path, test100(), 1000, "cifar_test100")
paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") paddle.dataset.common.convert(path, train10(), 1000, "cifar_train10")
paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") paddle.dataset.common.convert(path, test10(), 1000, "cifar_test10")
...@@ -19,7 +19,7 @@ import errno ...@@ -19,7 +19,7 @@ import errno
import shutil import shutil
import sys import sys
import importlib import importlib
import paddle.v2.dataset import paddle.dataset
import cPickle import cPickle
import glob import glob
import cPickle as pickle import cPickle as pickle
...@@ -105,24 +105,24 @@ def download(url, module_name, md5sum, save_name=None): ...@@ -105,24 +105,24 @@ def download(url, module_name, md5sum, save_name=None):
def fetch_all(): def fetch_all():
for module_name in filter(lambda x: not x.startswith("__"), for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.v2.dataset)): dir(paddle.dataset)):
if "fetch" in dir( if "fetch" in dir(
importlib.import_module("paddle.v2.dataset.%s" % module_name)): importlib.import_module("paddle.dataset.%s" % module_name)):
getattr( getattr(
importlib.import_module("paddle.v2.dataset.%s" % module_name), importlib.import_module("paddle.dataset.%s" % module_name),
"fetch")() "fetch")()
def fetch_all_recordio(path): def fetch_all_recordio(path):
for module_name in filter(lambda x: not x.startswith("__"), for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.v2.dataset)): dir(paddle.dataset)):
if "convert" in dir( if "convert" in dir(
importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ importlib.import_module("paddle.dataset.%s" % module_name)) and \
not module_name == "common": not module_name == "common":
ds_path = os.path.join(path, module_name) ds_path = os.path.join(path, module_name)
must_mkdirs(ds_path) must_mkdirs(ds_path)
getattr( getattr(
importlib.import_module("paddle.v2.dataset.%s" % module_name), importlib.import_module("paddle.dataset.%s" % module_name),
"convert")(ds_path) "convert")(ds_path)
...@@ -130,7 +130,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): ...@@ -130,7 +130,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
""" """
you can call the function as: you can call the function as:
split(paddle.v2.dataset.cifar.train10(), line_count=1000, split(paddle.dataset.cifar.train10(), line_count=1000,
suffix="imikolov-train-%05d.pickle") suffix="imikolov-train-%05d.pickle")
the output files as: the output files as:
......
...@@ -23,7 +23,7 @@ to initialize SRL model. ...@@ -23,7 +23,7 @@ to initialize SRL model.
import tarfile import tarfile
import gzip import gzip
import itertools import itertools
import paddle.v2.dataset.common import paddle.dataset.common
__all__ = ['test, get_dict', 'get_embedding', 'convert'] __all__ = ['test, get_dict', 'get_embedding', 'convert']
...@@ -203,14 +203,11 @@ def get_dict(): ...@@ -203,14 +203,11 @@ def get_dict():
Get the word, verb and label dictionary of Wikipedia corpus. Get the word, verb and label dictionary of Wikipedia corpus.
""" """
word_dict = load_dict( word_dict = load_dict(
paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
WORDDICT_MD5))
verb_dict = load_dict( verb_dict = load_dict(
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
VERBDICT_MD5))
label_dict = load_label_dict( label_dict = load_label_dict(
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
TRGDICT_MD5))
return word_dict, verb_dict, label_dict return word_dict, verb_dict, label_dict
...@@ -218,7 +215,7 @@ def get_embedding(): ...@@ -218,7 +215,7 @@ def get_embedding():
""" """
Get the trained word vector based on Wikipedia corpus. Get the trained word vector based on Wikipedia corpus.
""" """
return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) return paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
def test(): def test():
...@@ -235,23 +232,23 @@ def test(): ...@@ -235,23 +232,23 @@ def test():
""" """
word_dict, verb_dict, label_dict = get_dict() word_dict, verb_dict, label_dict = get_dict()
reader = corpus_reader( reader = corpus_reader(
paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
return reader_creator(reader, word_dict, verb_dict, label_dict) return reader_creator(reader, word_dict, verb_dict, label_dict)
def fetch(): def fetch():
paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
def convert(path): def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") paddle.dataset.common.convert(path, test(), 1000, "conl105_train")
paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") paddle.dataset.common.convert(path, test(), 1000, "conl105_test")
...@@ -34,8 +34,8 @@ import functools ...@@ -34,8 +34,8 @@ import functools
from common import download from common import download
import tarfile import tarfile
import scipy.io as scio import scipy.io as scio
from paddle.v2.image import * from paddle.dataset.image import *
from paddle.v2.reader import * from paddle.reader import *
import os import os
import numpy as np import numpy as np
from multiprocessing import cpu_count from multiprocessing import cpu_count
......
...@@ -20,7 +20,7 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing. ...@@ -20,7 +20,7 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary. Besides, this module also provides API for building dictionary.
""" """
import paddle.v2.dataset.common import paddle.dataset.common
import collections import collections
import tarfile import tarfile
import re import re
...@@ -37,8 +37,7 @@ def tokenize(pattern): ...@@ -37,8 +37,7 @@ def tokenize(pattern):
Read files that match the given pattern. Tokenize and yield each file. Read files that match the given pattern. Tokenize and yield each file.
""" """
with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', with tarfile.open(paddle.dataset.common.download(URL, 'imdb', MD5)) as tarf:
MD5)) as tarf:
# Note that we should use tarfile.next(), which does # Note that we should use tarfile.next(), which does
# sequential access of member files, other than # sequential access of member files, other than
# tarfile.extractfile, which does random access and might # tarfile.extractfile, which does random access and might
...@@ -136,7 +135,7 @@ def word_dict(): ...@@ -136,7 +135,7 @@ def word_dict():
def fetch(): def fetch():
paddle.v2.dataset.common.download(URL, 'imdb', MD5) paddle.dataset.common.download(URL, 'imdb', MD5)
def convert(path): def convert(path):
...@@ -144,5 +143,5 @@ def convert(path): ...@@ -144,5 +143,5 @@ def convert(path):
Converts dataset to recordio format Converts dataset to recordio format
""" """
w = word_dict() w = word_dict()
paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") paddle.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") paddle.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
...@@ -18,7 +18,7 @@ This module will download dataset from ...@@ -18,7 +18,7 @@ This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators. into paddle reader creators.
""" """
import paddle.v2.dataset.common import paddle.dataset.common
import collections import collections
import tarfile import tarfile
...@@ -54,9 +54,9 @@ def build_dict(min_word_freq=50): ...@@ -54,9 +54,9 @@ def build_dict(min_word_freq=50):
train_filename = './simple-examples/data/ptb.train.txt' train_filename = './simple-examples/data/ptb.train.txt'
test_filename = './simple-examples/data/ptb.valid.txt' test_filename = './simple-examples/data/ptb.valid.txt'
with tarfile.open( with tarfile.open(
paddle.v2.dataset.common.download( paddle.dataset.common.download(paddle.dataset.imikolov.URL,
paddle.v2.dataset.imikolov.URL, 'imikolov', 'imikolov',
paddle.v2.dataset.imikolov.MD5)) as tf: paddle.dataset.imikolov.MD5)) as tf:
trainf = tf.extractfile(train_filename) trainf = tf.extractfile(train_filename)
testf = tf.extractfile(test_filename) testf = tf.extractfile(test_filename)
word_freq = word_count(testf, word_count(trainf)) word_freq = word_count(testf, word_count(trainf))
...@@ -77,9 +77,9 @@ def build_dict(min_word_freq=50): ...@@ -77,9 +77,9 @@ def build_dict(min_word_freq=50):
def reader_creator(filename, word_idx, n, data_type): def reader_creator(filename, word_idx, n, data_type):
def reader(): def reader():
with tarfile.open( with tarfile.open(
paddle.v2.dataset.common.download( paddle.dataset.common.download(
paddle.v2.dataset.imikolov.URL, 'imikolov', paddle.dataset.imikolov.URL, 'imikolov',
paddle.v2.dataset.imikolov.MD5)) as tf: paddle.dataset.imikolov.MD5)) as tf:
f = tf.extractfile(filename) f = tf.extractfile(filename)
UNK = word_idx['<unk>'] UNK = word_idx['<unk>']
...@@ -145,7 +145,7 @@ def test(word_idx, n, data_type=DataType.NGRAM): ...@@ -145,7 +145,7 @@ def test(word_idx, n, data_type=DataType.NGRAM):
def fetch(): def fetch():
paddle.v2.dataset.common.download(URL, "imikolov", MD5) paddle.dataset.common.download(URL, "imikolov", MD5)
def convert(path): def convert(path):
...@@ -154,8 +154,7 @@ def convert(path): ...@@ -154,8 +154,7 @@ def convert(path):
""" """
N = 5 N = 5
word_dict = build_dict() word_dict = build_dict()
paddle.v2.dataset.common.convert(path, paddle.dataset.common.convert(path,
train(word_dict, N), 1000, train(word_dict, N), 1000, "imikolov_train")
"imikolov_train") paddle.dataset.common.convert(path,
paddle.v2.dataset.common.convert(path,
test(word_dict, N), 1000, "imikolov_test") test(word_dict, N), 1000, "imikolov_test")
...@@ -17,7 +17,7 @@ MNIST dataset. ...@@ -17,7 +17,7 @@ MNIST dataset.
This module will download dataset from http://yann.lecun.com/exdb/mnist/ and This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
parse training set and test set into paddle reader creators. parse training set and test set into paddle reader creators.
""" """
import paddle.v2.dataset.common import paddle.dataset.common
import subprocess import subprocess
import numpy import numpy
import platform import platform
...@@ -85,9 +85,9 @@ def train(): ...@@ -85,9 +85,9 @@ def train():
:rtype: callable :rtype: callable
""" """
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
TRAIN_IMAGE_MD5), TRAIN_IMAGE_MD5),
paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
TRAIN_LABEL_MD5), 100) TRAIN_LABEL_MD5), 100)
...@@ -102,22 +102,21 @@ def test(): ...@@ -102,22 +102,21 @@ def test():
:rtype: callable :rtype: callable
""" """
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5),
TEST_IMAGE_MD5), paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5),
paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', 100)
TEST_LABEL_MD5), 100)
def fetch(): def fetch():
paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
def convert(path): def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") paddle.dataset.common.convert(path, train(), 1000, "minist_train")
paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") paddle.dataset.common.convert(path, test(), 1000, "minist_test")
...@@ -23,7 +23,7 @@ set and test set into paddle reader creators. ...@@ -23,7 +23,7 @@ set and test set into paddle reader creators.
""" """
import zipfile import zipfile
import paddle.v2.dataset.common import paddle.dataset.common
import re import re
import random import random
import functools import functools
...@@ -100,7 +100,7 @@ USER_INFO = None ...@@ -100,7 +100,7 @@ USER_INFO = None
def __initialize_meta_info__(): def __initialize_meta_info__():
fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) fn = paddle.dataset.common.download(URL, "movielens", MD5)
global MOVIE_INFO global MOVIE_INFO
if MOVIE_INFO is None: if MOVIE_INFO is None:
pattern = re.compile(r'^(.*)\((\d+)\)$') pattern = re.compile(r'^(.*)\((\d+)\)$')
...@@ -247,15 +247,15 @@ def unittest(): ...@@ -247,15 +247,15 @@ def unittest():
def fetch(): def fetch():
paddle.v2.dataset.common.download(URL, "movielens", MD5) paddle.dataset.common.download(URL, "movielens", MD5)
def convert(path): def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") paddle.dataset.common.convert(path, train(), 1000, "movielens_train")
paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") paddle.dataset.common.convert(path, test(), 1000, "movielens_test")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -26,7 +26,7 @@ from itertools import chain ...@@ -26,7 +26,7 @@ from itertools import chain
import nltk import nltk
from nltk.corpus import movie_reviews from nltk.corpus import movie_reviews
import paddle.v2.dataset.common import paddle.dataset.common
__all__ = ['train', 'test', 'get_word_dict', 'convert'] __all__ = ['train', 'test', 'get_word_dict', 'convert']
NUM_TRAINING_INSTANCES = 1600 NUM_TRAINING_INSTANCES = 1600
...@@ -39,13 +39,13 @@ def download_data_if_not_yet(): ...@@ -39,13 +39,13 @@ def download_data_if_not_yet():
""" """
try: try:
# make sure that nltk can find the data # make sure that nltk can find the data
if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path: if paddle.dataset.common.DATA_HOME not in nltk.data.path:
nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME) nltk.data.path.append(paddle.dataset.common.DATA_HOME)
movie_reviews.categories() movie_reviews.categories()
except LookupError: except LookupError:
print "Downloading movie_reviews data set, please wait....." print "Downloading movie_reviews data set, please wait....."
nltk.download( nltk.download(
'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) 'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
print "Download data set success....." print "Download data set success....."
print "Path is " + nltk.data.find('corpora/movie_reviews').path print "Path is " + nltk.data.find('corpora/movie_reviews').path
...@@ -129,13 +129,12 @@ def test(): ...@@ -129,13 +129,12 @@ def test():
def fetch(): def fetch():
nltk.download( nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
def convert(path): def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") paddle.dataset.common.convert(path, train, 1000, "sentiment_train")
paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") paddle.dataset.common.convert(path, test, 1000, "sentiment_test")
py_test(test_image SRCS test_image.py)
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.cifar import paddle.dataset.cifar
import unittest import unittest
...@@ -29,25 +29,25 @@ class TestCIFAR(unittest.TestCase): ...@@ -29,25 +29,25 @@ class TestCIFAR(unittest.TestCase):
def test_test10(self): def test_test10(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.cifar.test10()) paddle.dataset.cifar.test10())
self.assertEqual(instances, 10000) self.assertEqual(instances, 10000)
self.assertEqual(max_label_value, 9) self.assertEqual(max_label_value, 9)
def test_train10(self): def test_train10(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.cifar.train10()) paddle.dataset.cifar.train10())
self.assertEqual(instances, 50000) self.assertEqual(instances, 50000)
self.assertEqual(max_label_value, 9) self.assertEqual(max_label_value, 9)
def test_test100(self): def test_test100(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.cifar.test100()) paddle.dataset.cifar.test100())
self.assertEqual(instances, 10000) self.assertEqual(instances, 10000)
self.assertEqual(max_label_value, 99) self.assertEqual(max_label_value, 99)
def test_train100(self): def test_train100(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.cifar.train100()) paddle.dataset.cifar.train100())
self.assertEqual(instances, 50000) self.assertEqual(instances, 50000)
self.assertEqual(max_label_value, 99) self.assertEqual(max_label_value, 99)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.common import paddle.dataset.common
import unittest import unittest
import tempfile import tempfile
import glob import glob
...@@ -24,14 +24,14 @@ class TestCommon(unittest.TestCase): ...@@ -24,14 +24,14 @@ class TestCommon(unittest.TestCase):
with open(temp_path, 'w') as f: with open(temp_path, 'w') as f:
f.write("Hello\n") f.write("Hello\n")
self.assertEqual('09f7e02f1290be211da707a266f153b3', self.assertEqual('09f7e02f1290be211da707a266f153b3',
paddle.v2.dataset.common.md5file(temp_path)) paddle.dataset.common.md5file(temp_path))
def test_download(self): def test_download(self):
yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460' yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
self.assertEqual( self.assertEqual(
paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460', paddle.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
paddle.v2.dataset.common.download( paddle.dataset.common.download(yi_avatar, 'test',
yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d')) 'f75287202d6622414c706c36c16f8e0d'))
def test_split(self): def test_split(self):
def test_reader(): def test_reader():
...@@ -42,7 +42,7 @@ class TestCommon(unittest.TestCase): ...@@ -42,7 +42,7 @@ class TestCommon(unittest.TestCase):
return reader return reader
_, temp_path = tempfile.mkstemp() _, temp_path = tempfile.mkstemp()
paddle.v2.dataset.common.split( paddle.dataset.common.split(
test_reader(), 4, suffix=temp_path + '/test-%05d.pickle') test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
files = glob.glob(temp_path + '/test-%05d.pickle') files = glob.glob(temp_path + '/test-%05d.pickle')
self.assertEqual(len(files), 3) self.assertEqual(len(files), 3)
...@@ -52,7 +52,7 @@ class TestCommon(unittest.TestCase): ...@@ -52,7 +52,7 @@ class TestCommon(unittest.TestCase):
for x in xrange(5): for x in xrange(5):
with open(temp_path + '/%05d.test' % x) as f: with open(temp_path + '/%05d.test' % x) as f:
f.write('%d\n' % x) f.write('%d\n' % x)
reader = paddle.v2.dataset.common.cluster_files_reader( reader = paddle.dataset.common.cluster_files_reader(
temp_path + '/*.test', 5, 0) temp_path + '/*.test', 5, 0)
for idx, e in enumerate(reader()): for idx, e in enumerate(reader()):
self.assertEqual(e, str("0")) self.assertEqual(e, str("0"))
...@@ -69,7 +69,7 @@ class TestCommon(unittest.TestCase): ...@@ -69,7 +69,7 @@ class TestCommon(unittest.TestCase):
return reader return reader
path = tempfile.mkdtemp() path = tempfile.mkdtemp()
paddle.v2.dataset.common.convert(path, paddle.dataset.common.convert(path,
test_reader(), num_shards, test_reader(), num_shards,
'random_images') 'random_images')
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.flowers import paddle.dataset.flowers
import unittest import unittest
...@@ -30,19 +30,19 @@ class TestFlowers(unittest.TestCase): ...@@ -30,19 +30,19 @@ class TestFlowers(unittest.TestCase):
def test_train(self): def test_train(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.flowers.train()) paddle.dataset.flowers.train())
self.assertEqual(instances, 6149) self.assertEqual(instances, 6149)
self.assertEqual(max_label_value, 102) self.assertEqual(max_label_value, 102)
def test_test(self): def test_test(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.flowers.test()) paddle.dataset.flowers.test())
self.assertEqual(instances, 1020) self.assertEqual(instances, 1020)
self.assertEqual(max_label_value, 102) self.assertEqual(max_label_value, 102)
def test_valid(self): def test_valid(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.flowers.valid()) paddle.dataset.flowers.valid())
self.assertEqual(instances, 1020) self.assertEqual(instances, 1020)
self.assertEqual(max_label_value, 102) self.assertEqual(max_label_value, 102)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.imdb import paddle.dataset.imdb
import unittest import unittest
import re import re
...@@ -30,15 +30,13 @@ class TestIMDB(unittest.TestCase): ...@@ -30,15 +30,13 @@ class TestIMDB(unittest.TestCase):
def test_build_dict(self): def test_build_dict(self):
if self.word_idx == None: if self.word_idx == None:
self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150)
150)
self.assertEqual(len(self.word_idx), 7036) self.assertEqual(len(self.word_idx), 7036)
def check_dataset(self, dataset, expected_size): def check_dataset(self, dataset, expected_size):
if self.word_idx == None: if self.word_idx == None:
self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150)
150)
sum = 0 sum = 0
for l in dataset(self.word_idx): for l in dataset(self.word_idx):
...@@ -47,10 +45,10 @@ class TestIMDB(unittest.TestCase): ...@@ -47,10 +45,10 @@ class TestIMDB(unittest.TestCase):
self.assertEqual(sum, expected_size) self.assertEqual(sum, expected_size)
def test_train(self): def test_train(self):
self.check_dataset(paddle.v2.dataset.imdb.train, 25000) self.check_dataset(paddle.dataset.imdb.train, 25000)
def test_test(self): def test_test(self):
self.check_dataset(paddle.v2.dataset.imdb.test, 25000) self.check_dataset(paddle.dataset.imdb.test, 25000)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.imikolov import paddle.dataset.imikolov
import unittest import unittest
WORD_DICT = paddle.v2.dataset.imikolov.build_dict() WORD_DICT = paddle.dataset.imikolov.build_dict()
class TestMikolov(unittest.TestCase): class TestMikolov(unittest.TestCase):
...@@ -25,7 +25,7 @@ class TestMikolov(unittest.TestCase): ...@@ -25,7 +25,7 @@ class TestMikolov(unittest.TestCase):
def test_train(self): def test_train(self):
n = 5 n = 5
self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n) self.check_reader(paddle.dataset.imikolov.train(WORD_DICT, n), n)
first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\ first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\ 'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
...@@ -34,16 +34,16 @@ class TestMikolov(unittest.TestCase): ...@@ -34,16 +34,16 @@ class TestMikolov(unittest.TestCase):
WORD_DICT.get(ch, WORD_DICT['<unk>']) WORD_DICT.get(ch, WORD_DICT['<unk>'])
for ch in first_line.split(' ') for ch in first_line.split(' ')
] ]
for l in paddle.v2.dataset.imikolov.train( for l in paddle.dataset.imikolov.train(
WORD_DICT, n=-1, WORD_DICT, n=-1,
data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): data_type=paddle.dataset.imikolov.DataType.SEQ)():
read_line = l[0][1:] read_line = l[0][1:]
break break
self.assertEqual(first_line, read_line) self.assertEqual(first_line, read_line)
def test_test(self): def test_test(self):
n = 5 n = 5
self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n) self.check_reader(paddle.dataset.imikolov.test(WORD_DICT, n), n)
first_line = 'consumers may want to move their telephones a little '\ first_line = 'consumers may want to move their telephones a little '\
'closer to the tv set' 'closer to the tv set'
...@@ -51,9 +51,9 @@ class TestMikolov(unittest.TestCase): ...@@ -51,9 +51,9 @@ class TestMikolov(unittest.TestCase):
WORD_DICT.get(ch, WORD_DICT['<unk>']) WORD_DICT.get(ch, WORD_DICT['<unk>'])
for ch in first_line.split(' ') for ch in first_line.split(' ')
] ]
for l in paddle.v2.dataset.imikolov.test( for l in paddle.dataset.imikolov.test(
WORD_DICT, n=-1, WORD_DICT, n=-1,
data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): data_type=paddle.dataset.imikolov.DataType.SEQ)():
read_line = l[0][1:] read_line = l[0][1:]
break break
self.assertEqual(first_line, read_line) self.assertEqual(first_line, read_line)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.mnist import paddle.dataset.mnist
import unittest import unittest
...@@ -29,13 +29,13 @@ class TestMNIST(unittest.TestCase): ...@@ -29,13 +29,13 @@ class TestMNIST(unittest.TestCase):
def test_train(self): def test_train(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.mnist.train()) paddle.dataset.mnist.train())
self.assertEqual(instances, 60000) self.assertEqual(instances, 60000)
self.assertEqual(max_label_value, 9) self.assertEqual(max_label_value, 9)
def test_test(self): def test_test(self):
instances, max_label_value = self.check_reader( instances, max_label_value = self.check_reader(
paddle.v2.dataset.mnist.test()) paddle.dataset.mnist.test())
self.assertEqual(instances, 10000) self.assertEqual(instances, 10000)
self.assertEqual(max_label_value, 9) self.assertEqual(max_label_value, 9)
......
...@@ -12,19 +12,19 @@ ...@@ -12,19 +12,19 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.mq2007 import paddle.dataset.mq2007
import unittest import unittest
class TestMQ2007(unittest.TestCase): class TestMQ2007(unittest.TestCase):
def test_pairwise(self): def test_pairwise(self):
for label, query_left, query_right in paddle.v2.dataset.mq2007.test( for label, query_left, query_right in paddle.dataset.mq2007.test(
format="pairwise"): format="pairwise"):
self.assertEqual(query_left.shape(), (46, )) self.assertEqual(query_left.shape(), (46, ))
self.assertEqual(query_right.shape(), (46, )) self.assertEqual(query_right.shape(), (46, ))
def test_listwise(self): def test_listwise(self):
for label_array, query_array in paddle.v2.dataset.mq2007.test( for label_array, query_array in paddle.dataset.mq2007.test(
format="listwise"): format="listwise"):
self.assertEqual(len(label_array), len(query_array)) self.assertEqual(len(label_array), len(query_array))
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import unittest import unittest
import numpy as np import numpy as np
import paddle.v2.image as image import paddle.dataset.image as image
class Image(unittest.TestCase): class Image(unittest.TestCase):
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
import unittest import unittest
import nltk import nltk
import paddle.v2.dataset.sentiment as st import paddle.dataset.sentiment as st
from nltk.corpus import movie_reviews from nltk.corpus import movie_reviews
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.voc2012 import paddle.dataset.voc2012
import unittest import unittest
...@@ -26,15 +26,15 @@ class TestVOC(unittest.TestCase): ...@@ -26,15 +26,15 @@ class TestVOC(unittest.TestCase):
return sum return sum
def test_train(self): def test_train(self):
count = self.check_reader(paddle.v2.dataset.voc_seg.train()) count = self.check_reader(paddle.dataset.voc_seg.train())
self.assertEqual(count, 2913) self.assertEqual(count, 2913)
def test_test(self): def test_test(self):
count = self.check_reader(paddle.v2.dataset.voc_seg.test()) count = self.check_reader(paddle.dataset.voc_seg.test())
self.assertEqual(count, 1464) self.assertEqual(count, 1464)
def test_val(self): def test_val(self):
count = self.check_reader(paddle.v2.dataset.voc_seg.val()) count = self.check_reader(paddle.dataset.voc_seg.val())
self.assertEqual(count, 1449) self.assertEqual(count, 1449)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.dataset.wmt16 import paddle.dataset.wmt16
import unittest import unittest
...@@ -34,28 +34,28 @@ class TestWMT16(unittest.TestCase): ...@@ -34,28 +34,28 @@ class TestWMT16(unittest.TestCase):
def test_train(self): def test_train(self):
for idx, sample in enumerate( for idx, sample in enumerate(
paddle.v2.dataset.wmt16.train( paddle.dataset.wmt16.train(
src_dict_size=100000, trg_dict_size=100000)()): src_dict_size=100000, trg_dict_size=100000)()):
if idx >= 10: break if idx >= 10: break
self.checkout_one_sample(sample) self.checkout_one_sample(sample)
def test_test(self): def test_test(self):
for idx, sample in enumerate( for idx, sample in enumerate(
paddle.v2.dataset.wmt16.test( paddle.dataset.wmt16.test(
src_dict_size=1000, trg_dict_size=1000)()): src_dict_size=1000, trg_dict_size=1000)()):
if idx >= 10: break if idx >= 10: break
self.checkout_one_sample(sample) self.checkout_one_sample(sample)
def test_val(self): def test_val(self):
for idx, sample in enumerate( for idx, sample in enumerate(
paddle.v2.dataset.wmt16.validation( paddle.dataset.wmt16.validation(
src_dict_size=1000, trg_dict_size=1000)()): src_dict_size=1000, trg_dict_size=1000)()):
if idx >= 10: break if idx >= 10: break
self.checkout_one_sample(sample) self.checkout_one_sample(sample)
def test_get_dict(self): def test_get_dict(self):
dict_size = 1000 dict_size = 1000
word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True) word_dict = paddle.dataset.wmt16.get_dict("en", dict_size, True)
self.assertEqual(len(word_dict), dict_size) self.assertEqual(len(word_dict), dict_size)
self.assertEqual(word_dict[0], "<s>") self.assertEqual(word_dict[0], "<s>")
self.assertEqual(word_dict[1], "<e>") self.assertEqual(word_dict[1], "<e>")
......
...@@ -21,8 +21,7 @@ parse training set and test set into paddle reader creators. ...@@ -21,8 +21,7 @@ parse training set and test set into paddle reader creators.
import numpy as np import numpy as np
import os import os
import paddle.v2.dataset.common import paddle.dataset.common
from paddle.v2.parameters import Parameters
__all__ = ['train', 'test'] __all__ = ['train', 'test']
...@@ -85,7 +84,7 @@ def train(): ...@@ -85,7 +84,7 @@ def train():
:rtype: callable :rtype: callable
""" """
global UCI_TRAIN_DATA global UCI_TRAIN_DATA
load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
def reader(): def reader():
for d in UCI_TRAIN_DATA: for d in UCI_TRAIN_DATA:
...@@ -105,7 +104,7 @@ def test(): ...@@ -105,7 +104,7 @@ def test():
:rtype: callable :rtype: callable
""" """
global UCI_TEST_DATA global UCI_TEST_DATA
load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
def reader(): def reader():
for d in UCI_TEST_DATA: for d in UCI_TEST_DATA:
...@@ -114,21 +113,13 @@ def test(): ...@@ -114,21 +113,13 @@ def test():
return reader return reader
def model():
tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
MD5_MODEL)
with open(tar_file, 'r') as f:
parameters = Parameters.from_tar(f)
return parameters
def fetch(): def fetch():
paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) paddle.dataset.common.download(URL, 'uci_housing', MD5)
def convert(path): def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") paddle.dataset.common.convert(path, train(), 1000, "uci_housing_train")
paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") paddle.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
...@@ -22,8 +22,8 @@ with segmentation has been increased from 7,062 to 9,993. ...@@ -22,8 +22,8 @@ with segmentation has been increased from 7,062 to 9,993.
import tarfile import tarfile
import io import io
import numpy as np import numpy as np
from paddle.v2.dataset.common import download from paddle.dataset.common import download
from paddle.v2.image import * from paddle.dataset.image import *
from PIL import Image from PIL import Image
__all__ = ['train', 'test', 'val'] __all__ = ['train', 'test', 'val']
......
...@@ -22,8 +22,7 @@ parse training set and test set into paddle reader creators. ...@@ -22,8 +22,7 @@ parse training set and test set into paddle reader creators.
import tarfile import tarfile
import gzip import gzip
import paddle.v2.dataset.common import paddle.dataset.common
from paddle.v2.parameters import Parameters
__all__ = [ __all__ = [
'train', 'train',
...@@ -123,7 +122,7 @@ def train(dict_size): ...@@ -123,7 +122,7 @@ def train(dict_size):
:rtype: callable :rtype: callable
""" """
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
'train/train', dict_size) 'train/train', dict_size)
...@@ -139,27 +138,20 @@ def test(dict_size): ...@@ -139,27 +138,20 @@ def test(dict_size):
:rtype: callable :rtype: callable
""" """
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
'test/test', dict_size) 'test/test', dict_size)
def gen(dict_size): def gen(dict_size):
return reader_creator( return reader_creator(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
'gen/gen', dict_size) 'gen/gen', dict_size)
def model():
tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
with gzip.open(tar_file, 'r') as f:
parameters = Parameters.from_tar(f)
return parameters
def get_dict(dict_size, reverse=True): def get_dict(dict_size, reverse=True):
# if reverse = False, return dict = {'a':'001', 'b':'002', ...} # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
# else reverse = true, return dict = {'001':'a', '002':'b', ...} # else reverse = true, return dict = {'001':'a', '002':'b', ...}
tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
src_dict, trg_dict = __read_to_dict(tar_file, dict_size) src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
if reverse: if reverse:
src_dict = {v: k for k, v in src_dict.items()} src_dict = {v: k for k, v in src_dict.items()}
...@@ -168,8 +160,8 @@ def get_dict(dict_size, reverse=True): ...@@ -168,8 +160,8 @@ def get_dict(dict_size, reverse=True):
def fetch(): def fetch():
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
def convert(path): def convert(path):
...@@ -177,6 +169,5 @@ def convert(path): ...@@ -177,6 +169,5 @@ def convert(path):
Converts dataset to recordio format Converts dataset to recordio format
""" """
dict_size = 30000 dict_size = 30000
paddle.v2.dataset.common.convert(path, paddle.dataset.common.convert(path, train(dict_size), 1000, "wmt14_train")
train(dict_size), 1000, "wmt14_train") paddle.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
...@@ -33,7 +33,7 @@ import tarfile ...@@ -33,7 +33,7 @@ import tarfile
import gzip import gzip
from collections import defaultdict from collections import defaultdict
import paddle.v2.dataset.common import paddle.dataset.common
__all__ = [ __all__ = [
"train", "train",
...@@ -76,7 +76,7 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -76,7 +76,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
def __load_dict(tar_file, dict_size, lang, reverse=False): def __load_dict(tar_file, dict_size, lang, reverse=False):
dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
"wmt16/%s_%d.dict" % (lang, dict_size)) "wmt16/%s_%d.dict" % (lang, dict_size))
if not os.path.exists(dict_path) or ( if not os.path.exists(dict_path) or (
len(open(dict_path, "r").readlines()) != dict_size): len(open(dict_path, "r").readlines()) != dict_size):
...@@ -178,7 +178,7 @@ def train(src_dict_size, trg_dict_size, src_lang="en"): ...@@ -178,7 +178,7 @@ def train(src_dict_size, trg_dict_size, src_lang="en"):
src_lang) src_lang)
return reader_creator( return reader_creator(
tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
"wmt16.tar.gz"), "wmt16.tar.gz"),
file_name="wmt16/train", file_name="wmt16/train",
src_dict_size=src_dict_size, src_dict_size=src_dict_size,
...@@ -227,7 +227,7 @@ def test(src_dict_size, trg_dict_size, src_lang="en"): ...@@ -227,7 +227,7 @@ def test(src_dict_size, trg_dict_size, src_lang="en"):
src_lang) src_lang)
return reader_creator( return reader_creator(
tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
"wmt16.tar.gz"), "wmt16.tar.gz"),
file_name="wmt16/test", file_name="wmt16/test",
src_dict_size=src_dict_size, src_dict_size=src_dict_size,
...@@ -274,7 +274,7 @@ def validation(src_dict_size, trg_dict_size, src_lang="en"): ...@@ -274,7 +274,7 @@ def validation(src_dict_size, trg_dict_size, src_lang="en"):
src_lang) src_lang)
return reader_creator( return reader_creator(
tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
"wmt16.tar.gz"), "wmt16.tar.gz"),
file_name="wmt16/val", file_name="wmt16/val",
src_dict_size=src_dict_size, src_dict_size=src_dict_size,
...@@ -303,12 +303,12 @@ def get_dict(lang, dict_size, reverse=False): ...@@ -303,12 +303,12 @@ def get_dict(lang, dict_size, reverse=False):
if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS) if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS)
else: dict_size = min(dict_size, TOTAL_DE_WORDS) else: dict_size = min(dict_size, TOTAL_DE_WORDS)
dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
"wmt16/%s_%d.dict" % (lang, dict_size)) "wmt16/%s_%d.dict" % (lang, dict_size))
assert os.path.exists(dict_path), "Word dictionary does not exist. " assert os.path.exists(dict_path), "Word dictionary does not exist. "
"Please invoke paddle.dataset.wmt16.train/test/validation first " "Please invoke paddle.dataset.wmt16.train/test/validation first "
"to build the dictionary." "to build the dictionary."
tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz") tar_file = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16.tar.gz")
return __load_dict(tar_file, dict_size, lang, reverse) return __load_dict(tar_file, dict_size, lang, reverse)
...@@ -323,7 +323,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang): ...@@ -323,7 +323,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang):
"""Converts dataset to recordio format. """Converts dataset to recordio format.
""" """
paddle.v2.dataset.common.convert( paddle.dataset.common.convert(
path, path,
train( train(
src_dict_size=src_dict_size, src_dict_size=src_dict_size,
...@@ -331,7 +331,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang): ...@@ -331,7 +331,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang):
src_lang=src_lang), src_lang=src_lang),
1000, 1000,
"wmt16_train") "wmt16_train")
paddle.v2.dataset.common.convert( paddle.dataset.common.convert(
path, path,
test( test(
src_dict_size=src_dict_size, src_dict_size=src_dict_size,
...@@ -339,7 +339,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang): ...@@ -339,7 +339,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang):
src_lang=src_lang), src_lang=src_lang),
1000, 1000,
"wmt16_test") "wmt16_test")
paddle.v2.dataset.common.convert( paddle.dataset.common.convert(
path, path,
validation( validation(
src_dict_size=src_dict_size, src_dict_size=src_dict_size,
......
...@@ -41,6 +41,7 @@ from memory_optimization_transpiler import memory_optimize, release_memory ...@@ -41,6 +41,7 @@ from memory_optimization_transpiler import memory_optimize, release_memory
import profiler import profiler
import unique_name import unique_name
import recordio_writer import recordio_writer
from parallel_executor import ParallelExecutor
Tensor = LoDTensor Tensor = LoDTensor
...@@ -68,6 +69,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ ...@@ -68,6 +69,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
'profiler', 'profiler',
'unique_name', 'unique_name',
'recordio_writer', 'recordio_writer',
'ParallelExecutor',
] ]
......
...@@ -82,11 +82,14 @@ class SelectCase(object): ...@@ -82,11 +82,14 @@ class SelectCase(object):
RECEIVE = 2 RECEIVE = 2
def __init__(self, def __init__(self,
select,
case_idx, case_idx,
case_to_execute, case_to_execute,
channel_action_fn=None, channel_action_fn=None,
channel=None, channel=None,
value=None): value=None,
is_copy=False):
self.select = select
self.helper = LayerHelper('conditional_block') self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program self.main_program = self.helper.main_program
self.is_scalar_condition = True self.is_scalar_condition = True
...@@ -99,7 +102,24 @@ class SelectCase(object): ...@@ -99,7 +102,24 @@ class SelectCase(object):
self.action = (self.SEND self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else self.DEFAULT self.RECEIVE) if channel_action_fn else self.DEFAULT
self.value = value
X = value
if self.action == self.SEND and is_copy:
# We create of copy of the data we want to send
copied_X = self.select.parent_block.create_var(
name=unique_name.generate(value.name + '_copy'),
type=value.type,
dtype=value.dtype,
shape=value.shape,
lod_level=value.lod_level,
capacity=value.capacity
if hasattr(value, 'capacity') else None, )
self.select.parent_block.append_op(
type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X
self.value = X
self.channel = channel self.channel = channel
def __enter__(self): def __enter__(self):
...@@ -173,6 +193,7 @@ class SelectCase(object): ...@@ -173,6 +193,7 @@ class SelectCase(object):
class Select(BlockGuard): class Select(BlockGuard):
def __init__(self, name=None): def __init__(self, name=None):
self.helper = LayerHelper('select', name=name) self.helper = LayerHelper('select', name=name)
self.parent_block = self.helper.main_program.current_block()
self.cases = [] self.cases = []
super(Select, self).__init__(self.helper.main_program) super(Select, self).__init__(self.helper.main_program)
...@@ -183,12 +204,12 @@ class Select(BlockGuard): ...@@ -183,12 +204,12 @@ class Select(BlockGuard):
super(Select, self).__enter__() super(Select, self).__enter__()
return self return self
def case(self, channel_action_fn, channel, value): def case(self, channel_action_fn, channel, value, is_copy=False):
"""Create a new block for this condition. """Create a new block for this condition.
""" """
select_case = SelectCase( select_case = SelectCase(self,
len(self.cases), self.case_to_execute, channel_action_fn, channel, len(self.cases), self.case_to_execute,
value) channel_action_fn, channel, value, is_copy)
self.cases.append(select_case) self.cases.append(select_case)
...@@ -197,7 +218,7 @@ class Select(BlockGuard): ...@@ -197,7 +218,7 @@ class Select(BlockGuard):
def default(self): def default(self):
"""Create a default case block for this condition. """Create a default case block for this condition.
""" """
default_case = SelectCase(len(self.cases), self.case_to_execute) default_case = SelectCase(self, len(self.cases), self.case_to_execute)
self.cases.append(default_case) self.cases.append(default_case)
...@@ -339,35 +360,26 @@ def channel_send(channel, value, is_copy=False): ...@@ -339,35 +360,26 @@ def channel_send(channel, value, is_copy=False):
main_program = helper.main_program main_program = helper.main_program
channel_send_block = main_program.current_block() channel_send_block = main_program.current_block()
status = helper.create_variable(
name=unique_name.generate('status'),
type=core.VarDesc.VarType.LOD_TENSOR,
dtype=core.VarDesc.VarType.BOOL)
X = value X = value
if is_copy is True: if is_copy:
copied_X = helper.create_variable( copied_X = helper.create_variable(
name=unique_name.generate(value.name + '_copy'), name=unique_name.generate(value.name + '_copy'),
type=value.type, type=value.type,
dtype=value.dtype, dtype=value.dtype,
shape=value.shape, shape=value.shape,
lod_level=value.lod_level, lod_level=value.lod_level,
capacity=value.capacity) capacity=value.capacity if hasattr(value, 'capacity') else None)
assign_op = channel_send_block.append_op( assign_op = channel_send_block.append_op(
type="assign_op", inputs={"X": value}, outputs={"Out": copied_X}) type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X X = copied_X
channel_send_op = channel_send_block.append_op( channel_send_block.append_op(
type="channel_send", type="channel_send", inputs={
inputs={
"Channel": channel, "Channel": channel,
"X": X, "X": X,
}, })
outputs={"Status": status})
return status
def channel_recv(channel, return_value): def channel_recv(channel, return_value):
......
...@@ -338,15 +338,24 @@ class DistributeTranspiler: ...@@ -338,15 +338,24 @@ class DistributeTranspiler:
else: else:
self._append_pserver_non_opt_ops(block, op) self._append_pserver_non_opt_ops(block, op)
append_block = optimize_block
# append lr decay ops to the child block if exits
lr_ops = self._get_lr_ops()
if len(lr_ops) > 0:
for _, op in enumerate(lr_ops):
self._append_pserver_non_opt_ops(append_block, op)
append_block = pserver_program.create_block(append_block.idx)
# append op to the current block # append op to the current block
per_opt_block = optimize_block per_opt_block = append_block
for _, opt_op in enumerate(opt_op_on_pserver): for _, opt_op in enumerate(opt_op_on_pserver):
for _, op in enumerate(self.optimize_ops): for _, op in enumerate(self.optimize_ops):
# optimizer is connected to itself # optimizer is connected to itself
if ufind.is_connected(op, opt_op) and \ if ufind.is_connected(op, opt_op) and \
op not in global_ops: op not in global_ops:
__append_optimize_op__(op, per_opt_block) __append_optimize_op__(op, per_opt_block)
per_opt_block = pserver_program.create_block(0) per_opt_block = pserver_program.create_block(append_block.idx)
# append global ops # append global ops
for glb_op in global_ops: for glb_op in global_ops:
...@@ -786,3 +795,33 @@ class DistributeTranspiler: ...@@ -786,3 +795,33 @@ class DistributeTranspiler:
else: else:
iomap[key] = vars iomap[key] = vars
return iomap return iomap
def _get_lr_ops(self):
lr_ops = []
# find learning rate variables by optimize op
lr_vars = set()
for op in self.optimize_ops:
if self._is_opt_op(op):
lr_vars.add(op.input("LearningRate")[0])
find_ops = []
# find ops which output is lr var
block = self.program.global_block()
for op in block.ops:
if set(op.output_arg_names) & lr_vars:
find_ops.append(op)
# make a union find struct by the ops in default_main_program
ufind = UnionFind(block.ops)
for op1 in block.ops:
for op2 in block.ops:
# NOTE: we need to skip all optimize ops, since it is connected
# with forward/backward ops and lr ops, we only need the lr ops.
if op1 != op2 and self._is_op_connected(op1, op2) and \
not self._is_opt_op(op1) and not self._is_opt_op(op2):
ufind.union(op1, op2)
# find all ops which is related with lr var
for op1 in block.ops:
for op2 in find_ops:
if ufind.is_connected(op1, op2):
lr_ops.append(op1)
return lr_ops
...@@ -403,6 +403,8 @@ class LayerHelper(object): ...@@ -403,6 +403,8 @@ class LayerHelper(object):
if 'use_mkldnn' in self.kwargs: if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
act_type = act.pop('type') act_type = act.pop('type')
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
self.append_op( self.append_op(
type=act_type, type=act_type,
inputs={"X": [input_var]}, inputs={"X": [input_var]},
......
...@@ -18,6 +18,7 @@ from tensor import assign, fill_constant ...@@ -18,6 +18,7 @@ from tensor import assign, fill_constant
from .. import core from .. import core
from ..framework import Program, Variable, Operator from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name from ..layer_helper import LayerHelper, unique_name
from ..initializer import force_init_on_cpu
from ops import logical_and, logical_not, logical_or from ops import logical_and, logical_not, logical_or
__all__ = [ __all__ = [
...@@ -949,7 +950,7 @@ def create_array(dtype): ...@@ -949,7 +950,7 @@ def create_array(dtype):
dtype=dtype) dtype=dtype)
def less_than(x, y, cond=None, **ignored): def less_than(x, y, force_cpu=True, cond=None, **ignored):
""" """
**Less than** **Less than**
...@@ -958,6 +959,7 @@ def less_than(x, y, cond=None, **ignored): ...@@ -958,6 +959,7 @@ def less_than(x, y, cond=None, **ignored):
Args: Args:
x(Variable): First operand of *less_than* x(Variable): First operand of *less_than*
y(Variable): Second operand of *less_than* y(Variable): Second operand of *less_than*
force_cpu(Bool|True): The output data will be on CPU if set true.
cond(Variable|None): Optional output variable to store the result of *less_than* cond(Variable|None): Optional output variable to store the result of *less_than*
Returns: Returns:
...@@ -974,8 +976,11 @@ def less_than(x, y, cond=None, **ignored): ...@@ -974,8 +976,11 @@ def less_than(x, y, cond=None, **ignored):
cond.stop_gradient = True cond.stop_gradient = True
helper.append_op( helper.append_op(
type='less_than', inputs={'X': [x], type='less_than',
'Y': [y]}, outputs={'Out': [cond]}) inputs={'X': [x],
'Y': [y]},
outputs={'Out': [cond]},
attrs={'force_cpu': force_cpu or force_init_on_cpu()})
return cond return cond
...@@ -1396,7 +1401,8 @@ class DynamicRNN(object): ...@@ -1396,7 +1401,8 @@ class DynamicRNN(object):
type='less_than', type='less_than',
inputs={'X': self.step_idx, inputs={'X': self.step_idx,
'Y': self.max_seq_len}, 'Y': self.max_seq_len},
outputs={'Out': self.cond}) outputs={'Out': self.cond},
attrs={'force_cpu': True})
input_array = parent_block.create_var( input_array = parent_block.create_var(
name=unique_name.generate('dynamic_rnn_input_array'), name=unique_name.generate('dynamic_rnn_input_array'),
...@@ -1445,7 +1451,11 @@ class DynamicRNN(object): ...@@ -1445,7 +1451,11 @@ class DynamicRNN(object):
for new_mem, mem_array in self.mem_link: for new_mem, mem_array in self.mem_link:
array_write(x=new_mem, i=self.step_idx, array=mem_array) array_write(x=new_mem, i=self.step_idx, array=mem_array)
less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond) less_than(
x=self.step_idx,
y=self.max_seq_len,
force_cpu=True,
cond=self.cond)
self.status = DynamicRNN.AFTER_RNN self.status = DynamicRNN.AFTER_RNN
for each_array in self.output_array: for each_array in self.output_array:
......
...@@ -134,6 +134,7 @@ def detection_output(loc, ...@@ -134,6 +134,7 @@ def detection_output(loc,
scores = nn.softmax(input=scores) scores = nn.softmax(input=scores)
scores = ops.reshape(x=scores, shape=old_shape) scores = ops.reshape(x=scores, shape=old_shape)
scores = nn.transpose(scores, perm=[0, 2, 1]) scores = nn.transpose(scores, perm=[0, 2, 1])
scores.stop_gradient = True
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
helper.append_op( helper.append_op(
type="multiclass_nms", type="multiclass_nms",
...@@ -148,6 +149,7 @@ def detection_output(loc, ...@@ -148,6 +149,7 @@ def detection_output(loc,
'score_threshold': score_threshold, 'score_threshold': score_threshold,
'nms_eta': 1.0 'nms_eta': 1.0
}) })
nmsed_outs.stop_gradient = True
return nmsed_outs return nmsed_outs
...@@ -837,4 +839,6 @@ def multi_box_head(inputs, ...@@ -837,4 +839,6 @@ def multi_box_head(inputs,
mbox_locs_concat = tensor.concat(mbox_locs, axis=1) mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
mbox_confs_concat = tensor.concat(mbox_confs, axis=1) mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
box.stop_gradient = True
var.stop_gradient = True
return mbox_locs_concat, mbox_confs_concat, box, var return mbox_locs_concat, mbox_confs_concat, box, var
...@@ -113,9 +113,9 @@ class ListenAndServ(object): ...@@ -113,9 +113,9 @@ class ListenAndServ(object):
which can receive variables from clients and run a block. which can receive variables from clients and run a block.
""" """
def __init__(self, endpoint, fan_in=1, optimizer_mode=True): def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True):
self.helper = LayerHelper("listen_and_serv") self.helper = LayerHelper("listen_and_serv")
self.inputs = [] self.inputs = inputs
self.outputs = [] self.outputs = []
self.endpoint = endpoint self.endpoint = endpoint
self.fan_in = fan_in self.fan_in = fan_in
...@@ -160,18 +160,13 @@ class ListenAndServ(object): ...@@ -160,18 +160,13 @@ class ListenAndServ(object):
current_block = main_program.current_block() current_block = main_program.current_block()
parent_block = self.parent_block() parent_block = self.parent_block()
params, grads = self.get_params_and_grads()
param_names = [p.name for p in params]
grad_names = [g.name for g in grads]
parent_block.append_op( parent_block.append_op(
type='listen_and_serv', type='listen_and_serv',
inputs={}, inputs={"X": self.inputs},
outputs={}, outputs={},
attrs={ attrs={
'endpoint': self.endpoint, 'endpoint': self.endpoint,
'Fanin': self.fan_in, 'Fanin': self.fan_in,
'ParamList': param_names,
'GradList': grad_names,
'OptimizeBlock': current_block 'OptimizeBlock': current_block
}) })
...@@ -196,10 +191,14 @@ def Send(endpoints, send_vars, get_vars): ...@@ -196,10 +191,14 @@ def Send(endpoints, send_vars, get_vars):
endpoints = list(set(epmap)) endpoints = list(set(epmap))
helper = LayerHelper("Send", **locals()) helper = LayerHelper("Send", **locals())
rpc_client_var = default_main_program().global_block().create_var(
name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW)
helper.append_op( helper.append_op(
type="send", type="send",
inputs={"X": send_vars}, inputs={"X": send_vars},
outputs={"Out": get_vars}, outputs={"Out": get_vars,
"RPCClient": rpc_client_var},
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
......
...@@ -74,6 +74,7 @@ __all__ = [ ...@@ -74,6 +74,7 @@ __all__ = [
'one_hot', 'one_hot',
'autoincreased_step_counter', 'autoincreased_step_counter',
'lod_reset', 'lod_reset',
'lrn',
] ]
...@@ -1482,6 +1483,7 @@ def batch_norm(input, ...@@ -1482,6 +1483,7 @@ def batch_norm(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
data_layout='NCHW', data_layout='NCHW',
in_place=False,
name=None, name=None,
moving_mean_name=None, moving_mean_name=None,
moving_variance_name=None): moving_variance_name=None):
...@@ -1537,7 +1539,7 @@ def batch_norm(input, ...@@ -1537,7 +1539,7 @@ def batch_norm(input,
saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
batch_norm_out = helper.create_tmp_variable(dtype) batch_norm_out = input if in_place else helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type="batch_norm", type="batch_norm",
...@@ -3410,3 +3412,73 @@ def lod_reset(x, y=None, target_lod=None): ...@@ -3410,3 +3412,73 @@ def lod_reset(x, y=None, target_lod=None):
raise ValueError("y and target_lod should not be both None.") raise ValueError("y and target_lod should not be both None.")
return out return out
def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
"""
Local Response Normalization Layer. This layer performs a type of
"lateral inhibition" by normalizing over local input regions.
The formula is as follows:
.. math::
Output(i, x, y) = Input(i, x, y) / \left(
k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)}
(Input(j, x, y))^2 \right)^{\beta}
In the above equation:
* :math:`n`: The number of channels to sum over.
* :math:`k`: The offset (avoid being divided by 0).
* :math:`alpha`: The scaling parameter.
* :math:`beta`: The exponent parameter.
Refer to `ImageNet Classification with Deep Convolutional Neural Networks
<https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
Args:
input (Variable): The input tensor of this layer, and the dimension of input tensor must be 4.
n (int, default 5): The number of channels to sum over.
k (float, default 1.0): An offset (usually positive to avoid dividing by 0).
alpha (float, default 1e-4): The scaling parameter.
beta (float, default 0.75): The exponent.
name (str, default None): A name for this operation.
Raises:
ValueError: If rank of the input tensor is not 4.
Returns:
A tensor variable storing the transformation result.
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32")
lrn = fluid.layers.lrn(input=data)
"""
helper = LayerHelper('lrn', **locals())
dtype = helper.input_dtype()
input_shape = input.shape
dims = len(input_shape)
if dims != 4:
raise ValueError(
"dims of input must be 4(not %d), and it's order must be NCHW" %
(dims))
mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
lrn_out = helper.create_tmp_variable(dtype)
helper.append_op(
type="lrn",
inputs={"X": input},
outputs={
"Out": lrn_out,
"MidOut": mid_out,
},
attrs={"n": n,
"k": k,
"alpha": alpha,
"beta": beta})
return lrn_out
...@@ -25,6 +25,8 @@ __activations__ = [ ...@@ -25,6 +25,8 @@ __activations__ = [
'abs', 'abs',
'ceil', 'ceil',
'floor', 'floor',
'cos',
'sin',
'round', 'round',
'reciprocal', 'reciprocal',
'log', 'log',
......
...@@ -98,7 +98,7 @@ def img_conv_group(input, ...@@ -98,7 +98,7 @@ def img_conv_group(input,
use_mkldnn=use_mkldnn) use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]: if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act) tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
drop_rate = conv_batchnorm_drop_rate[i] drop_rate = conv_batchnorm_drop_rate[i]
if abs(drop_rate) > 1e-5: if abs(drop_rate) > 1e-5:
tmp = layers.dropout(x=tmp, dropout_prob=drop_rate) tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import core
import multiprocessing
import framework
import executor
__all__ = ['ParallelExecutor']
class ParallelExecutor(object):
def __init__(self, loss_name, use_cuda, num_threads=None):
places = []
if use_cuda:
for i in xrange(core.get_cuda_device_count()):
p = core.Place()
p.set_place(core.CUDAPlace(i))
places.append(p)
else:
for i in xrange(multiprocessing.cpu_count()):
p = core.Place()
p.set_place(core.CPUPlace())
places.append(p)
if num_threads is None:
num_threads = min(len(places) * 2, multiprocessing.cpu_count())
startup = framework.default_startup_program()
main = framework.default_main_program()
scope = executor.global_scope()
self.executor = core.ParallelExecutor(
num_threads,
True if use_cuda else False, # use_event
places,
set([
p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient
]),
startup.desc,
main.desc,
loss_name,
scope)
self.scope = scope
def run(self, fetch_list):
fetch_var_name = '@FETCHED_VAR_NAME@'
self.executor.run(fetch_list, fetch_var_name)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
return [arr[i] for i in range(len(arr))]
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import contextlib import contextlib
import numpy import numpy
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from __future__ import print_function from __future__ import print_function
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import contextlib import contextlib
import math import math
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
import math import math
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.v2.dataset.conll05 as conll05 import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.initializer import init_on_cpu from paddle.fluid.initializer import init_on_cpu
import contextlib import contextlib
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import contextlib import contextlib
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
import paddle.fluid.layers as pd import paddle.fluid.layers as pd
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from __future__ import print_function from __future__ import print_function
import argparse import argparse
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.v2 as paddle import paddle
import sys import sys
import numpy import numpy
import unittest import unittest
......
...@@ -16,7 +16,7 @@ import math ...@@ -16,7 +16,7 @@ import math
import sys import sys
import os import os
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
......
...@@ -15,7 +15,7 @@ from __future__ import print_function ...@@ -15,7 +15,7 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.v2 as paddle import paddle
import contextlib import contextlib
import math import math
import numpy as np import numpy as np
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import unittest import unittest
import os import os
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import math import math
import sys import sys
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import sys import sys
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import math import math
import sys import sys
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
......
...@@ -19,7 +19,7 @@ import os ...@@ -19,7 +19,7 @@ import os
import matplotlib import matplotlib
import numpy import numpy
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
matplotlib.use('Agg') matplotlib.use('Agg')
......
...@@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase): ...@@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase):
with while_op.block(): with while_op.block():
result2 = fill_constant( result2 = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
x_to_send_tmp = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
# TODO(abhinav): Need to perform copy when doing a channel send.
# Once this is complete, we can remove these lines
assign(input=x, output=x_to_send_tmp)
with fluid.Select() as select: with fluid.Select() as select:
with select.case(fluid.channel_send, channel, with select.case(
x_to_send_tmp): fluid.channel_send, channel, x, is_copy=True):
assign(input=x, output=x_tmp) assign(input=x, output=x_tmp)
assign(input=y, output=x) assign(input=y, output=x)
assign(elementwise_add(x=x_tmp, y=y), output=y) assign(elementwise_add(x=x_tmp, y=y), output=y)
...@@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase): ...@@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase):
core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64) core.VarDesc.VarType.FP64)
pong_result = self._create_tensor('pong_return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
def ping(ch, message): def ping(ch, message):
message_to_send_tmp = fill_constant( fluid.channel_send(ch, message, is_copy=True)
shape=[1], dtype=core.VarDesc.VarType.FP64, value=0)
assign(input=message, output=message_to_send_tmp)
fluid.channel_send(ch, message_to_send_tmp)
def pong(ch1, ch2): def pong(ch1, ch2):
fluid.channel_recv(ch1, ping_result) fluid.channel_recv(ch1, ping_result)
assign(input=ping_result, output=pong_result) fluid.channel_send(ch2, ping_result, is_copy=True)
fluid.channel_send(ch2, pong_result)
pings = fluid.make_channel( pings = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
import sys import sys
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
BATCH_SIZE = 128 BATCH_SIZE = 128
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
BATCH_SIZE = 128 BATCH_SIZE = 128
......
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import MomentumOptimizer from paddle.fluid.optimizer import MomentumOptimizer
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.v2 as paddle
import unittest import unittest
import numpy as np import numpy as np
......
...@@ -2,3 +2,5 @@ mnist.recordio ...@@ -2,3 +2,5 @@ mnist.recordio
mnist_0.recordio mnist_0.recordio
mnist_1.recordio mnist_1.recordio
mnist_2.recordio mnist_2.recordio
flowers.recordio
wmt16.recordio
...@@ -196,6 +196,34 @@ class TestFloor(OpTest): ...@@ -196,6 +196,34 @@ class TestFloor(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestCos(OpTest):
def setUp(self):
self.op_type = "cos"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
self.inputs = {'X': x}
self.outputs = {'Out': np.cos(self.inputs['X'])}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestSin(OpTest):
def setUp(self):
self.op_type = "sin"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
self.inputs = {'X': x}
self.outputs = {'Out': np.sin(self.inputs['X'])}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestRound(OpTest): class TestRound(OpTest):
def setUp(self): def setUp(self):
self.op_type = "round" self.op_type = "round"
...@@ -506,5 +534,54 @@ class TestSwish(OpTest): ...@@ -506,5 +534,54 @@ class TestSwish(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
#--------------------test MKLDNN--------------------
class TestMKLDNNRelu(TestRelu):
def setUp(self):
super(TestMKLDNNRelu, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanh(TestTanh):
def setUp(self):
super(TestMKLDNNTanh, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrt(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrt, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbs(TestAbs):
def setUp(self):
super(TestMKLDNNAbs, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x}
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -20,19 +20,35 @@ from op_test import OpTest ...@@ -20,19 +20,35 @@ from op_test import OpTest
class TestConcatOp(OpTest): class TestConcatOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "concat" self.op_type = "concat"
x0 = np.random.random((2, 1, 4, 5)).astype('float32') self.init_test_data()
x1 = np.random.random((2, 2, 4, 5)).astype('float32') self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
x2 = np.random.random((2, 3, 4, 5)).astype('float32') self.attrs = {'axis': self.axis}
axis = 1 self.outputs = {
self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]} 'Out': np.concatenate(
self.attrs = {'axis': axis} (self.x0, self.x1, self.x2), axis=self.axis)
self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)} }
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['x0'], 'Out') self.check_grad(['x0'], 'Out')
self.check_grad(['x1'], 'Out')
self.check_grad(['x2'], 'Out')
def init_test_data(self):
self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
class TestConcatOp2(OpTest):
def init_test_data(self):
self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.v2 as paddle import paddle
import unittest import unittest
import numpy import numpy
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import unittest import unittest
import paddle.v2 as paddle import paddle
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
......
...@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase): ...@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.softmax(hid)) self.assertIsNotNone(layers.softmax(hid))
print(str(program)) print(str(program))
def test_lrn(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[6, 2, 2], dtype='float32')
self.assertIsNotNone(layers.lrn(data))
print(str(program))
def test_get_places(self): def test_get_places(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -97,5 +97,24 @@ class TestLRNMKLDNNOp(TestLRNOp): ...@@ -97,5 +97,24 @@ class TestLRNMKLDNNOp(TestLRNOp):
self.check_output(atol=0.002) self.check_output(atol=0.002)
class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
def get_attrs(self):
attrs = TestLRNMKLDNNOp.get_attrs(self)
attrs['is_test'] = True
return attrs
def test_check_grad_normal(self):
def check_raise_is_test():
try:
self.check_grad(['X'], 'Out', max_relative_error=0.01)
except Exception as e:
t = \
"is_test attribute should be set to False in training phase."
if t in str(e):
raise AttributeError
self.assertRaises(AttributeError, check_raise_is_test)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.v2 as paddle import paddle
import paddle.v2.dataset.mnist as mnist import paddle.dataset.mnist as mnist
class TestMultipleReader(unittest.TestCase): class TestMultipleReader(unittest.TestCase):
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.v2 as paddle import paddle
import paddle.v2.dataset.mnist as mnist import paddle.dataset.mnist as mnist
from shutil import copyfile from shutil import copyfile
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy
import unittest
import paddle.fluid as fluid
import paddle
import paddle.dataset.mnist as mnist
import paddle.dataset.wmt16 as wmt16
def simple_fc_net():
reader = fluid.layers.open_recordio_file(
filename='./mnist.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm():
reader = fluid.layers.open_recordio_file(
filename='./mnist.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(1):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def squeeze_excitation(input, num_channels, reduction_ratio):
# pool = fluid.layers.pool2d(
# input=input, pool_size=0, pool_type='avg', global_pooling=True)
conv = input
shape = conv.shape
reshape = fluid.layers.reshape(
x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
squeeze = fluid.layers.fc(input=pool,
size=num_channels / reduction_ratio,
act='relu')
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid')
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
def shortcut(input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out:
if stride == 1:
filter_size = 1
else:
filter_size = 3
return conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
# The number of first 1x1 convolutional channels for each bottleneck build block
# was halved to reduce the compution cost.
conv0 = conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = conv_bn_layer(
input=conv0,
num_filters=num_filters * 2,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def SE_ResNeXt152(batch_size=4):
img = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant(
shape=[batch_size, 1], dtype='int64', value=0.0)
conv = conv_bn_layer(
input=img, num_filters=64, filter_size=3, stride=2, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
for block in range(len(depth)):
for i in range(depth[block]):
conv = bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
shape = conv.shape
reshape = fluid.layers.reshape(
x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
# Classifier layer:
prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
import time
class TestParallelExecutorBase(unittest.TestCase):
def check_network_convergence(self,
method,
memory_opt=True,
iter=10,
batch_size=None):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = method()
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if memory_opt:
fluid.memory_optimize(main)
exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count()
begin = time.time()
first_loss, = exe.run([loss.name])
first_loss = numpy.array(first_loss)
for i in xrange(iter):
exe.run([])
last_loss, = exe.run([loss.name])
end = time.time()
if batch_size is not None:
print "%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin))
last_loss = numpy.array(last_loss)
print first_loss, last_loss
# self.assertGreater(first_loss[0], last_loss[0])
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=32)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist.recordio', reader, feeder)
def test_simple_fc(self):
self.check_network_convergence(simple_fc_net)
def test_batchnorm_fc(self):
self.check_network_convergence(fc_with_batchnorm)
class TestResnet(TestParallelExecutorBase):
# @classmethod
# def setUpClass(cls):
# # import os
# # if os.path.exists('./flowers.recordio'):
# # return
# with fluid.program_guard(fluid.Program(), fluid.Program()):
# reader = paddle.batch(flowers.train(), batch_size=4)
# feeder = fluid.DataFeeder(
# feed_list=[
# fluid.layers.data(
# name='image', shape=[3, 224, 224]),
# fluid.layers.data(
# name='label', shape=[1], dtype='int64'),
# ],
# place=fluid.CPUPlace())
# fluid.recordio_writer.convert_reader_to_recordio_file(
# "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
def test_resnet(self):
import functools
batch_size = 4
self.check_network_convergence(
functools.partial(
SE_ResNeXt152, batch_size=batch_size),
iter=20,
batch_size=batch_size)
class ModelHyperParams(object):
# Dictionary size for source and target language. This model directly uses
# paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
# alreay been added, but the <pad> token is not added. Transformer requires
# sequences in a mini-batch are padded to have the same length. A <pad> token is
# added into the original dictionary in paddle.dateset.wmt16.
# size of source word dictionary.
src_vocab_size = 10000
# index for <pad> token in source language.
src_pad_idx = src_vocab_size
# size of target word dictionay
trg_vocab_size = 10000
# index for <pad> token in target language.
trg_pad_idx = trg_vocab_size
# position value corresponding to the <pad> token.
pos_pad_idx = 0
# max length of sequences. It should plus 1 to include position
# padding token for position encoding.
max_length = 50
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 1024
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of head used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rate used by all dropout layers.
dropout = 0.1
import numpy as np
def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias. Then, convert the numpy
data to tensors and return a dict mapping names to tensors.
"""
def __pad_batch_data(insts,
pad_idx,
is_target=False,
return_pos=True,
return_attn_bias=True,
return_max_len=True):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, 1])]
if return_pos:
inst_pos = np.array([[
pos_i + 1 if w_i != pad_idx else 0
for pos_i, w_i in enumerate(inst)
] for inst in inst_data])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
if return_attn_bias:
if is_target:
# This is used to avoid attention on paddings and subsequent
# words.
slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
max_len))
slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
[-1, 1, max_len, max_len])
slf_attn_bias_data = np.tile(slf_attn_bias_data,
[1, n_head, 1, 1]) * [-1e9]
else:
# This is used to avoid attention on paddings.
slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
(max_len - len(inst))
for inst in insts])
slf_attn_bias_data = np.tile(
slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
[1, n_head, max_len, 1])
return_list += [slf_attn_bias_data.astype("float32")]
if return_max_len:
return_list += [max_len]
return return_list if len(return_list) > 1 else return_list[0]
def data_to_tensor(data_list, name_list, input_dict, place):
assert len(data_list) == len(name_list)
for i in range(len(name_list)):
tensor = fluid.LoDTensor()
tensor.set(data_list[i], place)
input_dict[name_list[i]] = tensor
src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, is_target=False)
trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
[inst[1] for inst in insts], trg_pad_idx, is_target=True)
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, trg_max_len, 1]).astype("float32")
lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
False, False, False)
lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
return [
src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
]
import transformer_model
def transformer():
return transformer_model.transformer(
ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer, ModelHyperParams.n_head,
ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
class TestTransformer(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
reader = paddle.batch(
wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=transformer_model.batch_size)
with fluid.recordio_writer.create_recordio_writer(
"./wmt16.recordio") as writer:
for batch in reader():
for tensor in prepare_batch_input(
batch, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
t = fluid.LoDTensor()
t.set(tensor, fluid.CPUPlace())
writer.append_tensor(t)
writer.complete_append_tensor()
@unittest.skip("transformer is buggy in multi gpu")
def test_main(self):
self.check_network_convergence(transformer)
...@@ -186,6 +186,34 @@ class TestBlockDesc(unittest.TestCase): ...@@ -186,6 +186,34 @@ class TestBlockDesc(unittest.TestCase):
all_ops.append(block.op(idx)) all_ops.append(block.op(idx))
self.assertEqual(all_ops, [op0, op1, op2]) self.assertEqual(all_ops, [op0, op1, op2])
def test_remove_op(self):
prog = core.ProgramDesc()
self.assertIsNotNone(prog)
block = prog.block(0)
self.assertIsNotNone(block)
op1 = block.append_op()
op2 = block.append_op()
var1 = block.var("var1")
var2 = block.var("var2")
var3 = block.var("var3")
var4 = block.var("var4")
var5 = block.var("var5")
op1.set_input("X", ["var1", "var2"])
op1.set_output("Y", ["var3", "var4"])
op2.set_input("X", ["var1"])
op2.set_output("Y", ["var4", "var5"])
# remove op1, its input var2 and output var3 will be removed at the same time,
# but its input var1 and output var4 will not be removed since they are used for op2.
block.remove_op(0, 1)
all_ops = []
for idx in xrange(0, block.op_size()):
all_ops.append(block.op(idx))
self.assertEqual(all_ops, [op2])
all_vars = block.all_vars()
self.assertEqual(set(all_vars), {var1, var4, var5})
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.v2 as paddle import paddle
import paddle.v2.dataset.mnist as mnist import paddle.dataset.mnist as mnist
class TestRecordIO(unittest.TestCase): class TestRecordIO(unittest.TestCase):
......
...@@ -38,14 +38,15 @@ class TestRecvOp(unittest.TestCase): ...@@ -38,14 +38,15 @@ class TestRecvOp(unittest.TestCase):
def init_serv(self, place): def init_serv(self, place):
main = fluid.Program() main = fluid.Program()
with fluid.program_guard(main): with fluid.program_guard(main):
serv = layers.ListenAndServ(
"127.0.0.1:6174", ["X"], optimizer_mode=False)
with serv.do():
x = layers.data( x = layers.data(
shape=[32, 32], shape=[32, 32],
dtype='float32', dtype='float32',
name="X", name="X",
append_batch_size=False) append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block()) fluid.initializer.Constant(value=1.0)(x, main.global_block())
serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
with serv.do():
o = layers.scale(x=x, scale=10.0) o = layers.scale(x=x, scale=10.0)
main.global_block().create_var( main.global_block().create_var(
name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape) name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape)
......
...@@ -49,6 +49,61 @@ class TestSeqAvgPool(OpTest): ...@@ -49,6 +49,61 @@ class TestSeqAvgPool(OpTest):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
class TestSeqSumPool(TestSeqAvgPool):
def compute(self, x, lod, out):
self.attrs = {'pooltype': "SUM"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = sub_x.sum(axis=0)
class TestSeqMaxPool(TestSeqAvgPool):
def set_data(self):
self.op_type = 'sequence_pool'
x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
lod = [[0, 4, 5, 8, 13]]
for i in range(4):
l = lod[0][i + 1] - lod[0][i]
x[lod[0][i] + np.random.randint(l), :] += 2.0
self.inputs = {'X': (x, lod)}
out = np.zeros((4, 23)).astype('float32')
self.outputs = {'Out': out}
return x, lod, out
def compute(self, x, lod, out):
self.attrs = {'pooltype': "MAX"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = np.amax(sub_x, axis=0)
class TestSeqSqrtPool(TestSeqAvgPool):
def compute(self, x, lod, out):
self.attrs = {'pooltype': "SQRT"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
len = lod[0][i + 1] - lod[0][i]
out[i] = sub_x.sum(axis=0) / np.sqrt(len)
class TestSeqLastPool(TestSeqAvgPool):
def compute(self, x, lod, out):
self.attrs = {'pooltype': "LAST"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = sub_x[-1, :]
class TestSeqFirstPool(TestSeqAvgPool):
def compute(self, x, lod, out):
self.attrs = {'pooltype': "FIRST"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = sub_x[0, :]
class TestSeqAvgPool2D(TestSeqAvgPool): class TestSeqAvgPool2D(TestSeqAvgPool):
def set_data(self): def set_data(self):
self.op_type = 'sequence_pool' self.op_type = 'sequence_pool'
...@@ -68,14 +123,6 @@ class TestSeqAvgPool2D(TestSeqAvgPool): ...@@ -68,14 +123,6 @@ class TestSeqAvgPool2D(TestSeqAvgPool):
out[i] = np.reshape(sub_x.mean(axis=0), (3, 17)) out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
class TestSeqSumPool(TestSeqAvgPool):
def compute(self, x, lod, out):
self.attrs = {'pooltype': "SUM"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = sub_x.sum(axis=0)
class TestSeqSumPool2D(TestSeqAvgPool2D): class TestSeqSumPool2D(TestSeqAvgPool2D):
def compute(self, x, lod, out): def compute(self, x, lod, out):
self.attrs = {'pooltype': "SUM"} self.attrs = {'pooltype': "SUM"}
...@@ -84,15 +131,6 @@ class TestSeqSumPool2D(TestSeqAvgPool2D): ...@@ -84,15 +131,6 @@ class TestSeqSumPool2D(TestSeqAvgPool2D):
out[i] = np.reshape(sub_x.sum(axis=0), (3, 17)) out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
class TestSeqSqrtPool(TestSeqAvgPool):
def compute(self, x, lod, out):
self.attrs = {'pooltype': "SQRT"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
len = lod[0][i + 1] - lod[0][i]
out[i] = sub_x.sum(axis=0) / np.sqrt(len)
class TestSeqSqrtPool2D(TestSeqAvgPool2D): class TestSeqSqrtPool2D(TestSeqAvgPool2D):
def compute(self, x, lod, out): def compute(self, x, lod, out):
self.attrs = {'pooltype': "SQRT"} self.attrs = {'pooltype': "SQRT"}
...@@ -108,28 +146,6 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D): ...@@ -108,28 +146,6 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
self.check_grad(["X"], "Out", max_relative_error=0.06) self.check_grad(["X"], "Out", max_relative_error=0.06)
class TestSeqMaxPool(TestSeqAvgPool):
def set_data(self):
self.op_type = 'sequence_pool'
x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
lod = [[0, 4, 5, 8, 13]]
for i in range(4):
l = lod[0][i + 1] - lod[0][i]
x[lod[0][i] + np.random.randint(l), :] += 2.0
self.inputs = {'X': (x, lod)}
out = np.zeros((4, 23)).astype('float32')
self.outputs = {'Out': out}
return x, lod, out
def compute(self, x, lod, out):
self.attrs = {'pooltype': "MAX"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = np.amax(sub_x, axis=0)
class TestSeqMaxPool2D(TestSeqAvgPool2D): class TestSeqMaxPool2D(TestSeqAvgPool2D):
def set_data(self): def set_data(self):
self.op_type = 'sequence_pool' self.op_type = 'sequence_pool'
...@@ -151,14 +167,6 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D): ...@@ -151,14 +167,6 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
class TestSeqLastPool(TestSeqAvgPool):
def compute(self, x, lod, out):
self.attrs = {'pooltype': "LAST"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = sub_x[-1, :]
class TestSeqLastPool2D(TestSeqAvgPool2D): class TestSeqLastPool2D(TestSeqAvgPool2D):
def compute(self, x, lod, out): def compute(self, x, lod, out):
self.attrs = {'pooltype': "LAST"} self.attrs = {'pooltype': "LAST"}
...@@ -167,14 +175,6 @@ class TestSeqLastPool2D(TestSeqAvgPool2D): ...@@ -167,14 +175,6 @@ class TestSeqLastPool2D(TestSeqAvgPool2D):
out[i] = np.reshape(sub_x[-1, :], (3, 17)) out[i] = np.reshape(sub_x[-1, :], (3, 17))
class TestSeqFirstPool(TestSeqAvgPool):
def compute(self, x, lod, out):
self.attrs = {'pooltype': "FIRST"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = sub_x[0, :]
class TestSeqFirstPool2D(TestSeqAvgPool2D): class TestSeqFirstPool2D(TestSeqAvgPool2D):
def compute(self, x, lod, out): def compute(self, x, lod, out):
self.attrs = {'pooltype': "FIRST"} self.attrs = {'pooltype': "FIRST"}
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestSplitIdsOp(OpTest):
def setUp(self):
self.op_type = "split_ids"
ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
out0 = np.array([[0], [3], [6]]).astype('int64')
out1 = np.array([[]]).astype('int64')
out2 = np.array([[2], [2], [5], [5]]).astype('int64')
self.inputs = {'Ids': ids}
self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
...@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase): ...@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase):
def test_lod_tensor_gpu_init(self): def test_lod_tensor_gpu_init(self):
if not core.is_compiled_with_cuda(): if not core.is_compiled_with_cuda():
return return
scope = core.Scope()
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
lod_py = [[0, 2, 5], [0, 2, 4, 5]] lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor() lod_tensor = core.LoDTensor()
...@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase): ...@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase):
self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
self.assertListEqual(lod_py, lod_tensor.lod()) self.assertListEqual(lod_py, lod_tensor.lod())
def test_empty_tensor(self):
place = core.CPUPlace()
scope = core.Scope()
var = scope.var("test_tensor")
tensor = var.get_tensor()
tensor.set_dims([0, 1])
tensor.alloc_float(place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if core.is_compiled_with_cuda():
gpu_place = core.CUDAPlace(0)
tensor.alloc_float(gpu_place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table", )
batch_size = 64
def position_encoding_init(n_position, d_pos_vec):
"""
Generate the initial values for the sinusoid position encoding table.
"""
position_enc = np.array([[
pos / np.power(10000, 2 * (j // 2) / d_pos_vec)
for j in range(d_pos_vec)
] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
return position_enc.astype("float32")
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activiation to mask certain selected positions so that
they will not considered in attention weights.
"""
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
param_attr=fluid.initializer.Xavier(
uniform=False,
fan_in=d_model * d_key,
fan_out=n_head * d_key),
bias_attr=False,
num_flatten_dims=2)
k = layers.fc(input=keys,
size=d_key * n_head,
param_attr=fluid.initializer.Xavier(
uniform=False,
fan_in=d_model * d_key,
fan_out=n_head * d_key),
bias_attr=False,
num_flatten_dims=2)
v = layers.fc(input=values,
size=d_value * n_head,
param_attr=fluid.initializer.Xavier(
uniform=False,
fan_in=d_model * d_value,
fan_out=n_head * d_value),
bias_attr=False,
num_flatten_dims=2)
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of inpunt tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
if n_head == 1:
return x
hidden_size = x.shape[-1]
# FIXME(guosheng): Decouple the program desc with batch_size.
reshaped = layers.reshape(
x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])
# permuate the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of inpunt tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# FIXME(guosheng): Decouple the program desc with batch_size.
return layers.reshape(
x=trans_x,
shape=map(int,
[batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]))
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
"""
Scaled Dot-Product Attention
"""
# FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.
# The current implementation of softmax_op only supports 2D tensor,
# consequently it cannot be directly used here.
# If to use the reshape_op, Besides, the shape of product inferred in
# compile-time is not the actual shape in run-time. It cann't be used
# to set the attribute of reshape_op.
# So, here define the softmax for temporary solution.
def __softmax(x, eps=1e-9):
exp_out = layers.exp(x=x)
sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
scaled_q = layers.scale(x=q, scale=d_model**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
if dropout_rate:
weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False)
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
param_attr=fluid.initializer.Xavier(uniform=False),
bias_attr=False,
num_flatten_dims=2)
return proj_out
def positionwise_feed_forward(x, d_inner_hid, d_hid):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
param_attr=fluid.initializer.Uniform(
low=-(d_hid**-0.5), high=(d_hid**-0.5)),
act="relu")
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.initializer.Uniform(
low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5)))
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
"""
Add residual connection, layer normalization and droput to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.initializer.Constant(1.),
bias_attr=fluid.initializer.Constant(0.))
elif cmd == "d": # add dropout
if dropout:
out = layers.dropout(out, dropout_prob=dropout, is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def prepare_encoder(src_word,
src_pos,
src_vocab_size,
src_emb_dim,
src_pad_idx,
src_max_len,
dropout=0.,
pos_pad_idx=0,
pos_enc_param_name=None):
"""Add word embeddings and position encodings.
The output tensor has a shape of:
[batch_size, max_src_length_in_batch, d_model].
This module is used at the bottom of the encoder stacks.
"""
src_word_emb = layers.embedding(
src_word,
size=[src_vocab_size, src_emb_dim],
padding_idx=src_pad_idx,
param_attr=fluid.initializer.Normal(0., 1.))
src_pos_enc = layers.embedding(
src_pos,
size=[src_max_len, src_emb_dim],
padding_idx=pos_pad_idx,
param_attr=fluid.ParamAttr(
name=pos_enc_param_name, trainable=False))
enc_input = src_word_emb + src_pos_enc
# FIXME(guosheng): Decouple the program desc with batch_size.
enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim])
return layers.dropout(
enc_input, dropout_prob=dropout,
is_test=False) if dropout else enc_input
prepare_encoder = partial(
prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
prepare_decoder = partial(
prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate=0.):
"""The encoder layers that can be stacked to form a deep encoder.
This module consits of a multi-head (self) attention followed by
position-wise feed-forward networks and both the two components companied
with the post_process_layer to add residual connection, layer normalization
and droput.
"""
attn_output = multi_head_attention(enc_input, enc_input, enc_input,
attn_bias, d_key, d_value, d_model,
n_head, dropout_rate)
attn_output = post_process_layer(enc_input, attn_output, "dan",
dropout_rate)
ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model)
return post_process_layer(attn_output, ffd_output, "dan", dropout_rate)
def encoder(enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate=0.):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
for i in range(n_layer):
enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value,
d_model, d_inner_hid, dropout_rate)
enc_input = enc_output
return enc_output
def decoder_layer(dec_input,
enc_output,
slf_attn_bias,
dec_enc_attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate=0.):
""" The layer to be stacked in decoder part.
The structure of this module is similar to that in the encoder part except
a multi-head attention is added to implement encoder-decoder attention.
"""
slf_attn_output = multi_head_attention(
dec_input,
dec_input,
dec_input,
slf_attn_bias,
d_key,
d_value,
d_model,
n_head,
dropout_rate, )
slf_attn_output = post_process_layer(
dec_input,
slf_attn_output,
"dan", # residual connection + dropout + layer normalization
dropout_rate, )
enc_attn_output = multi_head_attention(
slf_attn_output,
enc_output,
enc_output,
dec_enc_attn_bias,
d_key,
d_value,
d_model,
n_head,
dropout_rate, )
enc_attn_output = post_process_layer(
slf_attn_output,
enc_attn_output,
"dan", # residual connection + dropout + layer normalization
dropout_rate, )
ffd_output = positionwise_feed_forward(
enc_attn_output,
d_inner_hid,
d_model, )
dec_output = post_process_layer(
enc_attn_output,
ffd_output,
"dan", # residual connection + dropout + layer normalization
dropout_rate, )
return dec_output
def decoder(dec_input,
enc_output,
dec_slf_attn_bias,
dec_enc_attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate=0.):
"""
The decoder is composed of a stack of identical decoder_layer layers.
"""
for i in range(n_layer):
dec_output = decoder_layer(
dec_input,
enc_output,
dec_slf_attn_bias,
dec_enc_attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate, )
dec_input = dec_output
return dec_output
def transformer(
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
src_pad_idx,
trg_pad_idx,
pos_pad_idx, ):
file_obj = fluid.layers.open_recordio_file(
filename='./wmt16.recordio',
shapes=[
[batch_size * max_length, 1],
[batch_size * max_length, 1],
[batch_size * max_length, 1],
[batch_size * max_length, 1],
[batch_size, n_head, max_length, max_length],
[batch_size, n_head, max_length, max_length],
[batch_size, n_head, max_length, max_length],
[batch_size * max_length, 1],
[batch_size * max_length, 1],
],
dtypes=[
'int64',
'int64',
'int64',
'int64',
'float32',
'float32',
'float32',
'int64',
'float32',
],
lod_levels=[0] * 9)
src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file(
file_obj)
enc_input = prepare_encoder(
src_word,
src_pos,
src_vocab_size,
d_model,
src_pad_idx,
max_length,
dropout_rate, )
enc_output = encoder(
enc_input,
src_slf_attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate, )
dec_input = prepare_decoder(
trg_word,
trg_pos,
trg_vocab_size,
d_model,
trg_pad_idx,
max_length,
dropout_rate, )
dec_output = decoder(
dec_input,
enc_output,
trg_slf_attn_bias,
trg_src_attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate, )
# TODO(guosheng): Share the weight matrix between the embedding layers and
# the pre-softmax linear transformation.
predict = layers.reshape(
x=layers.fc(input=dec_output,
size=trg_vocab_size,
param_attr=fluid.initializer.Xavier(uniform=False),
bias_attr=False,
num_flatten_dims=2),
shape=[-1, trg_vocab_size],
act="softmax")
cost = layers.cross_entropy(input=predict, label=gold)
weighted_cost = cost * weights
return layers.reduce_sum(weighted_cost)
...@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could ...@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
be used in user program. be used in user program.
""" """
__all__ = ['np_array', 'text_file', "cloud_reader"] __all__ = ['np_array', 'text_file', 'recordio']
def np_array(x): def np_array(x):
...@@ -66,7 +66,7 @@ def recordio(paths, buf_size=100): ...@@ -66,7 +66,7 @@ def recordio(paths, buf_size=100):
""" """
import recordio as rec import recordio as rec
import paddle.v2.reader.decorator as dec import paddle.reader.decorator as dec
import cPickle as pickle import cPickle as pickle
def reader(): def reader():
...@@ -83,48 +83,3 @@ def recordio(paths, buf_size=100): ...@@ -83,48 +83,3 @@ def recordio(paths, buf_size=100):
f.close() f.close()
return dec.buffered(reader, buf_size) return dec.buffered(reader, buf_size)
pass_num = 0
def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
"""
Create a data reader that yield a record one by one from
the paths:
:paths: path of recordio files, can be a string or a string list.
:etcd_endpoints: the endpoints for etcd cluster
:returns: data reader of recordio files.
.. code-block:: python
from paddle.v2.reader.creator import cloud_reader
etcd_endpoints = "http://127.0.0.1:2379"
trainer.train.(
reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
)
"""
import os
import cPickle as pickle
import paddle.v2.master as master
c = master.client(etcd_endpoints, timeout_sec, buf_size)
if isinstance(paths, basestring):
path = [paths]
else:
path = paths
c.set_dataset(path)
def reader():
global pass_num
c.paddle_start_get_records(pass_num)
pass_num += 1
while True:
r, e = c.next_record()
if not r:
if e != -2:
print "get record error: ", e
break
yield pickle.loads(r)
return reader
...@@ -28,14 +28,14 @@ ...@@ -28,14 +28,14 @@
import os import os
import unittest import unittest
import numpy as np import numpy as np
import paddle.v2.reader.creator import paddle.reader.creator
class TestNumpyArray(unittest.TestCase): class TestNumpyArray(unittest.TestCase):
def test_numpy_array(self): def test_numpy_array(self):
l = [[1, 2, 3], [4, 5, 6]] l = [[1, 2, 3], [4, 5, 6]]
x = np.array(l, np.int32) x = np.array(l, np.int32)
reader = paddle.v2.reader.creator.np_array(x) reader = paddle.reader.creator.np_array(x)
for idx, e in enumerate(reader()): for idx, e in enumerate(reader()):
self.assertItemsEqual(e, l[idx]) self.assertItemsEqual(e, l[idx])
...@@ -43,14 +43,14 @@ class TestNumpyArray(unittest.TestCase): ...@@ -43,14 +43,14 @@ class TestNumpyArray(unittest.TestCase):
class TestTextFile(unittest.TestCase): class TestTextFile(unittest.TestCase):
def test_text_file(self): def test_text_file(self):
path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt") path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
reader = paddle.v2.reader.creator.text_file(path) reader = paddle.reader.creator.text_file(path)
for idx, e in enumerate(reader()): for idx, e in enumerate(reader()):
self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
class TestRecordIO(unittest.TestCase): class TestRecordIO(unittest.TestCase):
def do_test(self, path): def do_test(self, path):
reader = paddle.v2.reader.creator.recordio(path) reader = paddle.reader.creator.recordio(path)
idx = 0 idx = 0
for e in reader(): for e in reader():
if idx == 0: if idx == 0:
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import time import time
import unittest import unittest
import paddle.v2.reader import paddle.reader
def reader_creator_10(dur): def reader_creator_10(dur):
...@@ -39,7 +39,7 @@ class TestMap(unittest.TestCase): ...@@ -39,7 +39,7 @@ class TestMap(unittest.TestCase):
yield "h" yield "h"
yield "i" yield "i"
r = paddle.v2.reader.map_readers(tokenize, read) r = paddle.reader.map_readers(tokenize, read)
for i, e in enumerate(r()): for i, e in enumerate(r()):
self.assertEqual(e, i) self.assertEqual(e, i)
...@@ -47,7 +47,7 @@ class TestMap(unittest.TestCase): ...@@ -47,7 +47,7 @@ class TestMap(unittest.TestCase):
class TestBuffered(unittest.TestCase): class TestBuffered(unittest.TestCase):
def test_read(self): def test_read(self):
for size in range(20): for size in range(20):
b = paddle.v2.reader.buffered(reader_creator_10(0), size) b = paddle.reader.buffered(reader_creator_10(0), size)
c = 0 c = 0
for i in b(): for i in b():
self.assertEqual(i, c) self.assertEqual(i, c)
...@@ -56,7 +56,7 @@ class TestBuffered(unittest.TestCase): ...@@ -56,7 +56,7 @@ class TestBuffered(unittest.TestCase):
def test_buffering(self): def test_buffering(self):
# read have 30ms delay. # read have 30ms delay.
b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10) b = paddle.reader.buffered(reader_creator_10(0.03), 10)
last_time = time.time() last_time = time.time()
for idx, i in enumerate(b()): for idx, i in enumerate(b()):
elapsed_time = time.time() - last_time elapsed_time = time.time() - last_time
...@@ -70,17 +70,17 @@ class TestBuffered(unittest.TestCase): ...@@ -70,17 +70,17 @@ class TestBuffered(unittest.TestCase):
class TestCompose(unittest.TestCase): class TestCompose(unittest.TestCase):
def test_compse(self): def test_compse(self):
reader = paddle.v2.reader.compose( reader = paddle.reader.compose(
reader_creator_10(0), reader_creator_10(0)) reader_creator_10(0), reader_creator_10(0))
for idx, e in enumerate(reader()): for idx, e in enumerate(reader()):
self.assertEqual(e, (idx, idx)) self.assertEqual(e, (idx, idx))
def test_compose_not_aligned(self): def test_compose_not_aligned(self):
total = 0 total = 0
reader = paddle.v2.reader.compose( reader = paddle.reader.compose(
paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
reader_creator_10(0)) reader_creator_10(0))
with self.assertRaises(paddle.v2.reader.ComposeNotAligned): with self.assertRaises(paddle.reader.ComposeNotAligned):
for e in reader(): for e in reader():
total += 1 total += 1
# expecting 10, not 20 # expecting 10, not 20
...@@ -88,8 +88,8 @@ class TestCompose(unittest.TestCase): ...@@ -88,8 +88,8 @@ class TestCompose(unittest.TestCase):
def test_compose_not_aligned_no_check(self): def test_compose_not_aligned_no_check(self):
total = 0 total = 0
reader = paddle.v2.reader.compose( reader = paddle.reader.compose(
paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
reader_creator_10(0), reader_creator_10(0),
check_alignment=False) check_alignment=False)
for e in reader(): for e in reader():
...@@ -100,7 +100,7 @@ class TestCompose(unittest.TestCase): ...@@ -100,7 +100,7 @@ class TestCompose(unittest.TestCase):
class TestChain(unittest.TestCase): class TestChain(unittest.TestCase):
def test_chain(self): def test_chain(self):
c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)) c = paddle.reader.chain(reader_creator_10(0), reader_creator_10(0))
idx = 0 idx = 0
for e in c(): for e in c():
self.assertEqual(e, idx % 10) self.assertEqual(e, idx % 10)
...@@ -113,7 +113,7 @@ class TestShuffle(unittest.TestCase): ...@@ -113,7 +113,7 @@ class TestShuffle(unittest.TestCase):
case = [(0, True), (1, True), (10, False), (100, False)] case = [(0, True), (1, True), (10, False), (100, False)]
a = reader_creator_10(0) a = reader_creator_10(0)
for size, checkEq in case: for size, checkEq in case:
s = paddle.v2.reader.shuffle(a, size) s = paddle.reader.shuffle(a, size)
total = 0 total = 0
for idx, e in enumerate(s()): for idx, e in enumerate(s()):
if checkEq: if checkEq:
...@@ -133,7 +133,7 @@ class TestXmap(unittest.TestCase): ...@@ -133,7 +133,7 @@ class TestXmap(unittest.TestCase):
for order in orders: for order in orders:
for tNum in thread_nums: for tNum in thread_nums:
for size in buffered_size: for size in buffered_size:
reader = paddle.v2.reader.xmap_readers(mapper, reader = paddle.reader.xmap_readers(mapper,
reader_creator_10(0), reader_creator_10(0),
tNum, size, order) tNum, size, order)
for n in xrange(3): for n in xrange(3):
...@@ -150,7 +150,7 @@ class TestPipeReader(unittest.TestCase): ...@@ -150,7 +150,7 @@ class TestPipeReader(unittest.TestCase):
def test_pipe_reader(self): def test_pipe_reader(self):
def example_reader(myfiles): def example_reader(myfiles):
for f in myfiles: for f in myfiles:
pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) pr = paddle.reader.PipeReader("cat %s" % f, bufsize=128)
for l in pr.get_line(): for l in pr.get_line():
yield l yield l
......
...@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation): ...@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation):
.. math:: .. math::
P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_j} } P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} }
""" """
def __init__(self): def __init__(self):
......
...@@ -22,17 +22,13 @@ import data_type ...@@ -22,17 +22,13 @@ import data_type
import topology import topology
import networks import networks
import evaluator import evaluator
from . import dataset
from . import reader
from . import plot from . import plot
import attr import attr
import op import op
import pooling import pooling
import inference import inference
import networks import networks
import minibatch
import plot import plot
import image
import paddle.trainer.config_parser as cp import paddle.trainer.config_parser as cp
__all__ = [ __all__ = [
...@@ -48,14 +44,11 @@ __all__ = [ ...@@ -48,14 +44,11 @@ __all__ = [
'data_type', 'data_type',
'attr', 'attr',
'pooling', 'pooling',
'dataset',
'reader',
'topology', 'topology',
'networks', 'networks',
'infer', 'infer',
'plot', 'plot',
'evaluator', 'evaluator',
'image',
'master', 'master',
] ]
...@@ -153,4 +146,3 @@ def init(**kwargs): ...@@ -153,4 +146,3 @@ def init(**kwargs):
infer = inference.infer infer = inference.infer
batch = minibatch.batch
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import numpy import numpy
import collections import collections
import topology import topology
import minibatch import paddle
import cPickle import cPickle
__all__ = ['infer', 'Inference'] __all__ = ['infer', 'Inference']
...@@ -80,7 +80,7 @@ class Inference(object): ...@@ -80,7 +80,7 @@ class Inference(object):
for each_sample in input: for each_sample in input:
yield each_sample yield each_sample
reader = minibatch.batch(__reader_impl__, batch_size=batch_size) reader = paddle.batch(__reader_impl__, batch_size=batch_size)
self.__gradient_machine__.start() self.__gradient_machine__.start()
for data_batch in reader(): for data_batch in reader():
......
...@@ -20,7 +20,7 @@ The primary usage shows below. ...@@ -20,7 +20,7 @@ The primary usage shows below.
.. code-block:: python .. code-block:: python
import paddle.v2 as paddle import paddle
img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784)) img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784))
hidden = paddle.layer.fc(input=img, size=200) hidden = paddle.layer.fc(input=img, size=200)
......
py_test(test_op SRCS test_op.py) py_test(test_op SRCS test_op.py)
py_test(test_image SRCS test_image.py)
py_test(test_layer SRCS test_layer.py) py_test(test_layer SRCS test_layer.py)
py_test(test_topology SRCS test_topology.py) py_test(test_topology SRCS test_topology.py)
py_test(test_rnn_layer SRCS test_rnn_layer.py) py_test(test_rnn_layer SRCS test_rnn_layer.py)
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
# limitations under the License. # limitations under the License.
import unittest import unittest
import math import math
import paddle.dataset as dataset
import paddle.v2 as paddle import paddle.v2 as paddle
...@@ -40,7 +41,7 @@ def wordemb(inlayer): ...@@ -40,7 +41,7 @@ def wordemb(inlayer):
def train(): def train():
word_dict = paddle.dataset.imikolov.build_dict() word_dict = dataset.imikolov.build_dict()
dict_size = len(word_dict) dict_size = len(word_dict)
# Every layer takes integer value of range [0, dict_size) # Every layer takes integer value of range [0, dict_size)
firstword = paddle.layer.data( firstword = paddle.layer.data(
......
...@@ -63,18 +63,18 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') ...@@ -63,18 +63,18 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
packages=['paddle', packages=['paddle',
'paddle.utils', 'paddle.utils',
'paddle.dataset',
'paddle.reader',
'paddle.fluid', 'paddle.fluid',
'paddle.fluid.proto', 'paddle.fluid.proto',
'paddle.fluid.proto.profiler', 'paddle.fluid.proto.profiler',
'paddle.fluid.layers'] 'paddle.fluid.layers']
if '${WITH_FLUID}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
packages+=['paddle.proto', packages+=['paddle.proto',
'paddle.trainer', 'paddle.trainer',
'paddle.trainer_config_helpers', 'paddle.trainer_config_helpers',
'paddle.v2', 'paddle.v2',
'paddle.v2.dataset',
'paddle.v2.reader',
'paddle.v2.master', 'paddle.v2.master',
'paddle.v2.plot', 'paddle.v2.plot',
'py_paddle'] 'py_paddle']
...@@ -87,7 +87,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: ...@@ -87,7 +87,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
# the prefix is sys.prefix which should always be usr # the prefix is sys.prefix which should always be usr
paddle_bins = '' paddle_bins = ''
if '${WITH_FLUID}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
paddle_bin_dir = 'opt/paddle/bin' paddle_bin_dir = 'opt/paddle/bin'
paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
'${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
...@@ -95,7 +95,7 @@ if '${WITH_FLUID}'== 'OFF': ...@@ -95,7 +95,7 @@ if '${WITH_FLUID}'== 'OFF':
'${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
package_data={'paddle.fluid': ['core.so']} package_data={'paddle.fluid': ['core.so']}
if '${WITH_FLUID}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
package_data['paddle.v2.master']=['libpaddle_master.so'] package_data['paddle.v2.master']=['libpaddle_master.so']
package_data['py_paddle']=['*.py','_swig_paddle.so'] package_data['py_paddle']=['*.py','_swig_paddle.so']
...@@ -106,7 +106,7 @@ package_dir={ ...@@ -106,7 +106,7 @@ package_dir={
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform', 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', 'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
} }
if '${WITH_FLUID}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle' package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
......
#!/bin/bash
TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
cpplint $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
exit $TOTAL_ERRORS
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册