Commit fbb75c6b authored by: chengduoZH

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/refine_gather_reduce

@@ -179,6 +179,7 @@ set(EXTERNAL_LIBS
if(WITH_GPU)
  include(cuda)
+  include(tensorrt)
endif(WITH_GPU)
if(WITH_AMD_GPU)
......
@@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# install glide
RUN curl -s -q https://glide.sh/get | sh
+# Install TensorRT
+# The unnecessary files have been removed to make the library small. It only contains include and lib now.
+RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+    tar -xz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
+    cp -rf /usr/local/TensorRT/lib /usr
# git credential to skip password typing
RUN git config --global credential.helper store
......
@@ -27,7 +27,7 @@ RUN git config --global credential.helper store
# Fix locales to en_US.UTF-8
RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
    pip install -U 'protobuf==3.1.0' && \
    pip install -U wheel sphinx && \
    pip install pre-commit
......
@@ -80,6 +80,16 @@ if(WITH_GPU)
  # Include cuda and cudnn
  include_directories(${CUDNN_INCLUDE_DIR})
  include_directories(${CUDA_TOOLKIT_INCLUDE})
+  if(TENSORRT_FOUND)
+    if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+      message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+    endif()
+    if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+      message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+    endif()
+    include_directories(${TENSORRT_INCLUDE_DIR})
+  endif()
elseif(WITH_AMD_GPU)
  add_definitions(-DPADDLE_WITH_HIP)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
......
if(NOT WITH_GPU)
return()
endif()
set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
find_path(TENSORRT_INCLUDE_DIR NvInfer.h
PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
$ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
NO_DEFAULT_PATH
)
find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
$ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
NO_DEFAULT_PATH
DOC "Path to TensorRT library.")
if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
set(TENSORRT_FOUND ON)
else()
set(TENSORRT_FOUND OFF)
endif()
if(TENSORRT_FOUND)
file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
"${TENSORRT_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
endif()
@@ -3,7 +3,9 @@ add_custom_target(paddle_apis ALL
add_custom_target(paddle_docs ALL
  DEPENDS paddle_v2_docs paddle_v2_docs_cn
-          paddle_fluid_docs paddle_fluid_docs_cn)
+          paddle_fluid_docs paddle_fluid_docs_cn
+          paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2)
add_subdirectory(fluid)
+add_subdirectory(mobile)
@@ -33,3 +33,45 @@ Xavier
    :members:
    :noindex:
MSRA
------
.. autoclass:: paddle.fluid.initializer.MSRA
:members:
:noindex:
ConstantInitializer
-------------------
.. autoclass:: paddle.fluid.initializer.ConstantInitializer
:members:
:noindex:
UniformInitializer
------------------
.. autoclass:: paddle.fluid.initializer.UniformInitializer
:members:
:noindex:
NormalInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.NormalInitializer
:members:
:noindex:
XavierInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.XavierInitializer
:members:
:noindex:
MSRA
------
MSRAInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.MSRAInitializer
:members:
:noindex:
@@ -815,3 +815,8 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
    :noindex:
topk
----
.. autofunction:: paddle.fluid.layers.topk
:noindex:
@@ -84,7 +84,7 @@ Running an operator can be asynchronized. There is a thread pool to execute an `
## Synchronize GPU Kernels
-The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm:
+The GPU is a non-blocking device. The different streams need be synchronized when switching streams. In current implementation, the synchronization based on the following algorithm:
1. `OpHandle` will record `DeviceContext` that it is used.
2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.
......
## Distributed training overview doc
Currently, Paddle Fluid uses a parameter server architecture to support distributed training.
For synchronous and asynchronous training, the differences are mostly in the logic of the parameter server. Synchronous training is already supported.
### Synchronous training
The training process of synchronous training is:
![synchronous distributed training](./src/sync_distributed_training.png)
1. Pserver
   1. Set `barrier_condition_` to 0 and wait for trainers to send gradients.
1. Trainer
   1. The trainer reads a minibatch of data, runs forward-backward with its local parameter copy, and computes the gradients for the parameters.
   1. The trainer uses the split op to split all gradients into blocks. The split method is determined at compile time.
   1. The trainer uses send_op to send all the split gradients to the corresponding parameter servers.
   1. After the trainer has sent all gradients, it sends a `BATCH_BARRIER_MESSAGE` to all pservers.
   1. The trainer calls GetVariable on the pserver and waits for `barrier_condition_` on the pserver to become 1.
1. Pserver
   1. The pserver counts the incoming `BATCH_BARRIER_MESSAGE`s.
   1. When the count of `BATCH_BARRIER_MESSAGE`s equals the number of trainers, the pserver knows it has received all gradients from all trainers.
   1. The pserver runs the optimization block to optimize the parameters.
   1. After optimization, the pserver sets `barrier_condition_` to 1.
   1. The pserver waits for `FETCH_BARRIER_MESSAGE`.
1. Trainer
   1. The trainer uses GetVariable to get all the parameters from the pserver.
   1. The trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
1. Pserver
   1. When the number of `FETCH_BARRIER_MESSAGE`s reaches the number of trainers, the pserver knows all parameters have been fetched; it goes back to step 1 and sets `barrier_condition_` to 0. A minimal sketch of this barrier bookkeeping is given after this list.
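Below is a minimal C++ sketch of this pserver-side barrier bookkeeping, assuming a hypothetical `BarrierMonitor` helper; it is illustrative only and not the actual Fluid `listen_and_serv_op` implementation.

```cpp
#include <condition_variable>
#include <mutex>
#include <string>

// Hypothetical sketch: count BATCH_BARRIER messages, run the optimize block
// once every trainer has reported, then release trainers blocked on
// GetVariable, and reset when all FETCH_BARRIER messages have arrived.
class BarrierMonitor {
 public:
  explicit BarrierMonitor(int num_trainers) : num_trainers_(num_trainers) {}

  // Called for every barrier message received from a trainer.
  void OnMessage(const std::string& msg_type) {
    std::unique_lock<std::mutex> lock(mu_);
    if (msg_type == "BATCH_BARRIER_MESSAGE" &&
        ++batch_barrier_ == num_trainers_) {
      RunOptimizeBlock();      // all gradients received: update parameters
      batch_barrier_ = 0;
      barrier_condition_ = 1;  // parameters are ready to be fetched
      cv_.notify_all();
    } else if (msg_type == "FETCH_BARRIER_MESSAGE" &&
               ++fetch_barrier_ == num_trainers_) {
      fetch_barrier_ = 0;
      barrier_condition_ = 0;  // go back to waiting for gradients
    }
  }

  // GetVariable blocks until the optimize block has finished.
  void WaitForParameters() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return barrier_condition_ == 1; });
  }

 private:
  void RunOptimizeBlock() { /* apply the aggregated gradients here */ }

  const int num_trainers_;
  int batch_barrier_ = 0;
  int fetch_barrier_ = 0;
  int barrier_condition_ = 0;
  std::mutex mu_;
  std::condition_variable cv_;
};
```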
### Asynchronous training
In the above process there are two barriers at which all trainers synchronize with each other. In asynchronous training these two barriers are not needed; a trainer simply sends its gradients to the pserver and then gets the parameters back.
The asynchronous training process is:
![asynchronous distributed training](./src/async_distributed_training.png)
1. Pserver:
   1. Each parameter has a queue to receive its gradients from trainers.
   1. Each parameter has a thread that reads gradients from the queue and runs the optimize block, using each gradient to update the parameter (see the sketch after this list).
   1. An independent thread handles the RPC call `GetVariable` so that trainers can get parameters back. (A thread pool might be used here to speed up fetching the parameters.)
1. Trainer:
   1. The trainer reads a batch of data, runs forward and backward with its local parameter copy, and computes the gradients for the parameters.
   1. The trainer splits all gradients into blocks and sends these gradient blocks to the pservers (each pserver puts them into the corresponding queues).
   1. The trainer gets all parameters back from the pservers.
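A minimal C++ sketch of the per-parameter queue plus optimizer thread described above, with a hypothetical `GradientQueue` type and plain SGD standing in for the optimize block (the real implementation works on LoDTensors inside the pserver):

```cpp
#include <condition_variable>
#include <deque>
#include <mutex>
#include <vector>

// Hypothetical gradient block: a flat float buffer for one parameter.
struct GradientBlock {
  std::vector<float> data;
};

// One blocking queue per parameter, fed by trainers over RPC.
class GradientQueue {
 public:
  void Push(GradientBlock grad) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      queue_.push_back(std::move(grad));
    }
    cv_.notify_one();
  }
  GradientBlock Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !queue_.empty(); });
    GradientBlock grad = std::move(queue_.front());
    queue_.pop_front();
    return grad;
  }

 private:
  std::deque<GradientBlock> queue_;
  std::mutex mu_;
  std::condition_variable cv_;
};

// One such loop runs in its own thread per parameter; plain SGD is only a
// placeholder for the real optimize block.
void OptimizeLoop(GradientQueue* queue, std::vector<float>* param, float lr) {
  for (;;) {
    GradientBlock grad = queue->Pop();  // blocks until a trainer sends one
    for (size_t i = 0; i < param->size() && i < grad.data.size(); ++i) {
      (*param)[i] -= lr * grad.data[i];
    }
  }
}
```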
### Note:
There are also some conditions that need to be considered. For example:
1. Whether a trainer needs to wait for the pserver to apply its gradients before getting the parameters back.
1. Whether we need a lock between the parameter update and the parameter fetch.
1. Whether one parameter must live on a single server, or whether it can be split and sent to multiple parameter servers.
The above asynchronous training architecture can support different modes; we can evaluate these questions in detail in the future.
# Design Doc: Asynchronous Update With Distributed Training
## Background
In typical synchronous distributed training, the significant steps are as follows:
1. A trainer computes the gradients and SENDs them to the parameter server (PServer) nodes.
1. After a PServer node has received the gradients from all trainers, it aggregates the
gradient variables for the same parameter into one gradient variable, applies the aggregated
gradient to the respective parameter, and finally uses an optimization algorithm (SGD, Momentum, ...)
to update the parameters.
1. The trainers wait for the PServers to finish the optimize stage and then GET the parameters from the PServers,
so all the trainers end up with the same parameters.
In synchronous distributed training, there has to be a `Barrier` to synchronize the
parameters after the optimize stage. The performance of a distributed training job
depends on its slowest node; with hundreds or thousands of training nodes in a
job, synchronous distributed training can therefore perform very poorly because of
slow nodes. So this design doc introduces an approach to implement
*asynchronous* distributed training in PaddlePaddle Fluid.
## Design
<img src="./src/async_update.png" width="600"/>
As shown in the figure above, we describe a global view of the asynchronous update process and use
the parameter `w1` as an example to introduce the steps:
1. Each gradient variable may be distributed across different GPU cards; they are aggregated
once all of them have been calculated.
1. Split the gradient variable into multiple blocks according to the number of PServer
instances and then send them (a block-splitting sketch is shown after this list).
1. The PServer runs an `Optimize Block` with a specified optimization algorithm to update
the specified parameter.
1. The trainer fetches the latest parameter from the PServer before running any forward op
that depends on the specified parameter.
1. Broadcast the received variable to the multiple GPU cards and continue with the next
mini-batch.
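A minimal sketch of the block-splitting step, assuming the gradient is a flat float buffer; the real split op works on LoDTensors and its split sizes are decided by the transpiler at compile time:

```cpp
#include <cstddef>
#include <vector>

// Split a flat gradient buffer into roughly equal blocks, one per PServer
// instance. blocks[i] would then be sent to PServer i.
std::vector<std::vector<float>> SplitGradient(const std::vector<float>& grad,
                                              std::size_t num_pservers) {
  std::vector<std::vector<float>> blocks(num_pservers);
  const std::size_t base = grad.size() / num_pservers;
  const std::size_t remainder = grad.size() % num_pservers;
  std::size_t offset = 0;
  for (std::size_t i = 0; i < num_pservers; ++i) {
    // The first `remainder` blocks receive one extra element.
    const std::size_t len = base + (i < remainder ? 1 : 0);
    blocks[i].assign(grad.begin() + offset, grad.begin() + offset + len);
    offset += len;
  }
  return blocks;
}
```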
### Trainer
- For multi-device distributed training, we first need to aggregate the gradient
variables placed on the different devices, and then schedule a `SendVars` operator to
send the gradient variables to the multiple PServer instances.
- Schedule a `FetchVars` operator to fetch the latest parameters from the PServer before running
the forward ops.
- There could be a large number of gradient variables to send, so we need another
thread pool (IO thread pool) whose number of schedulable threads is larger than that of the
computing thread pool, to avoid competing with computation for thread resources.
### Parameter Server
<img src="./src/async_pserver.png" width="750"/>
- Multiple trainer instances may want to optimize the same parameter at
the same time; to avoid races, we need one `BlockingQueue` for each gradient
variable so the incoming gradients are processed one by one.
- We need a `Map` structure that maps a gradient variable name to the `OptimizeBlock` which
can optimize the respective parameter (a minimal sketch of this mapping follows).
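A minimal sketch of the PServer-side bookkeeping described above; `OptimizeBlock` and `PServerState` are hypothetical stand-ins for the real Fluid structures, and the per-variable queues are elided:

```cpp
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Stand-in for the optimize block that updates one parameter.
struct OptimizeBlock {
  std::function<void(const std::vector<float>& grad)> run;
};

class PServerState {
 public:
  // Register the optimize block that handles one gradient variable.
  void Register(const std::string& grad_name, OptimizeBlock block) {
    grad_to_block_[grad_name] = std::move(block);
  }

  // Called when a gradient block for `grad_name` arrives from a trainer.
  // The per-variable BlockingQueue in front of this call guarantees the
  // gradients of one variable are processed one by one.
  void OnGradient(const std::string& grad_name,
                  const std::vector<float>& grad) {
    grad_to_block_.at(grad_name).run(grad);
  }

 private:
  std::unordered_map<std::string, OptimizeBlock> grad_to_block_;
};
```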
# MPI-enabled PaddlePaddle Design doc
# Background
When we do distributed multi-GPU training, the communication overhead between servers becomes the major bottleneck, for the following reasons:
1. Data must be copied at least once from GPU to CPU memory so that it is ready to transfer, and on the pserver side copying data from CPU to GPU introduces more overhead.
2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices.
3. TCP connections cannot make full use of RDMA 100Gb devices.
We will bring the OpenMPI API to PaddlePaddle, which gives two benefits:
1. Enable RDMA with PaddlePaddle, which brings a high-performance, low-latency network.
2. Enable GPUDirect with PaddlePaddle, which brings the highest-throughput and lowest-latency GPU reads and writes.
# Change list
* Compile args: We need to add compile args to enable MPI support.
* Execute args: We need to add execute args to specify when and how to use MPI operations.
* New ops: We need new ops ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
* Transpiler changes: The transpiler should add ```mpi_send_op``` and ```mpi_listenandserve_op``` to the running graph.
* MPI utils package: We need an MPI utils package to provide the low-level API support.
## Compile args
Because MPI and CUDA require hardware support, we will add compile args to enable MPI support and control compilation. We add a ```WITH_MPI``` compile arg to control whether MPI is used. If ```WITH_MPI``` is ```ON```, the build system will look for the OpenMPI installation during configuration. We should prepare the OpenMPI environment before compiling.
## Execute args
Launch the script using the ```mpirun``` launcher, for example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, we can number the actors (trainer/pserver/master) with 0 .. (n-1). A node's number is the rank of the calling process within its communicator (an integer); MPI processes identify each other using this rank ID. We have to create a mapping between PaddlePaddle's nodes and their rank IDs so that we can communicate with the correct destinations when using MPI operations. A sketch of reading the rank assignment from the environment is shown below.
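A minimal sketch of reading the rank assignment that ```mpirun``` exposes through the OpenMPI environment variables; the role-assignment rule below (rank 0 is the pserver) is an illustrative assumption, not the actual PaddlePaddle mapping:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

int main() {
  // mpirun/OpenMPI sets these for every launched process.
  const char* rank_str = std::getenv("OMPI_COMM_WORLD_RANK");
  const char* size_str = std::getenv("OMPI_COMM_WORLD_SIZE");
  if (rank_str == nullptr || size_str == nullptr) {
    std::cout << "Not launched with mpirun; fall back to gRPC-only mode.\n";
    return 0;
  }
  const int rank = std::atoi(rank_str);
  const int world_size = std::atoi(size_str);
  const int num_pservers = 1;  // assumed for illustration only
  const bool is_pserver = rank < num_pservers;
  std::cout << "rank " << rank << " of " << world_size << " acts as "
            << (is_pserver ? "pserver" : "trainer") << "\n";
  return 0;
}
```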
## New ops
We won't replace all gRPC requests with MPI requests: the standard gRPC library is still used for all administrative operations, while the MPI API is used to transfer tensors or SelectedRows to the pservers. Based on this idea, we create two new operators to handle sends and receives: ```mpi_send_op``` and ```mpi_listenandserve_op```. They are similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc); we will also build a new module to wrap the MPI send and receive process.
### mpi_send_op
Very similar to ```send_op```: we will replace the gRPC code that used to send gradients with ```mpi_module```, and at the same time wrap it with ```framework::Async```.
### mpi_listenandserve_op
Very similar to ```listen_and_serv_op```: we will replace the gRPC code that used to receive gradients with ```mpi_module```, and at the same time wrap it with ```framework::Async```.
## Transpiler optimized
**We can read the environment variables ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to determine whether MPI is being used; when the job is launched with OpenMPI, these variables must exist in the environment.**
If MPI use is confirmed, we will replace ```send_op``` with ```mpi_send_op``` in distribute_transpiler, and likewise replace ```listenandserve_op``` with ```mpi_listenandserve_op```.
## MPI utils package
In this package, we will wrap the low-level OpenMPI API for the ops above to use.
The APIs included in this package are:
* MPI send and receive module. We will build a new module to wrap the MPI send and receive process. MPI send and receive differ from gRPC: the MPI [receive](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know the receive buffer size and element type in advance. For this reason, we have to communicate twice: the first pass sends metadata about the gradient through gRPC, and the second is the real communication through MPI, which sends the gradient data to mpi_listenandserve_op (see the sketch after this list).
The detailed flow is below:
![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png)
* MPI global configuration, which stores the rank ID and the mapping in global variables, for example:
gRPC client : MPI nodes : ```127.0.0.1:32004 : 3```
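A minimal sketch of the second (MPI) phase of the two-pass protocol above, assuming the gRPC pass has already delivered the element count and peer rank so the receiver can size its buffer before posting ```MPI_Irecv```; this is illustrative and not the actual mpi_module code:

```cpp
#include <mpi.h>

#include <cstdint>
#include <vector>

// Receiver side (inside mpi_listenandserve_op): the buffer size is known
// from the gRPC metadata pass, so a matching MPI_Irecv can be posted.
std::vector<float> ReceiveGradient(int src_rank, int64_t num_elements, int tag) {
  std::vector<float> buffer(num_elements);
  MPI_Request request;
  MPI_Irecv(buffer.data(), static_cast<int>(num_elements), MPI_FLOAT, src_rank,
            tag, MPI_COMM_WORLD, &request);
  MPI_Wait(&request, MPI_STATUS_IGNORE);  // block until the gradient arrives
  return buffer;
}

// Sender side (inside mpi_send_op).
void SendGradient(const std::vector<float>& grad, int dst_rank, int tag) {
  MPI_Request request;
  MPI_Isend(grad.data(), static_cast<int>(grad.size()), MPI_FLOAT, dst_rank,
            tag, MPI_COMM_WORLD, &request);
  MPI_Wait(&request, MPI_STATUS_IGNORE);
}
```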
if(NOT DEFINED SPHINX_THEME)
set(SPHINX_THEME default)
endif()
if(NOT DEFINED SPHINX_THEME_DIR)
set(SPHINX_THEME_DIR)
endif()
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
sphinx_add_target(paddle_mobile_docs
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
"${BINARY_BUILD_DIR_CN}/conf.py"
@ONLY)
sphinx_add_target(paddle_mobile_docs_cn
html
${BINARY_BUILD_DIR_CN}
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
Mobile
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
\ No newline at end of file
Mobile
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_ios_en.md
cross_compiling_for_raspberry_en.md
@@ -6,7 +6,43 @@ Data Reader Interface
DataTypes
=========
-.. automodule:: paddle.v2.data_type
+.. autofunction:: paddle.v2.data_type.dense_array
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_value_slot
:noindex:
.. autoclass:: paddle.v2.data_type.InputType
    :members:
    :noindex:
......
@@ -102,7 +102,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-cc_test(channel_test SRCS channel_test.cc)
+# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
        channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
......
@@ -77,14 +77,9 @@ struct TestBroadcastOpHandle {
    local_scopes_[input_scope_idx]->Var("input");
    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
-    vars_.emplace_back(new VarHandle());
-    VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
-    in_var_handle->place_ = gpu_list_[input_scope_idx];
-    in_var_handle->name_ = "input";
-    in_var_handle->version_ = 1;
-    in_var_handle->scope_idx_ = input_scope_idx;
-    in_var_handle->generated_op_ = nullptr;
+    auto* in_var_handle =
+        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+    vars_.emplace_back(in_var_handle);
    op_handle_->AddInput(in_var_handle);
    // add dummy var
@@ -96,12 +91,8 @@ struct TestBroadcastOpHandle {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
-      vars_.emplace_back(new VarHandle());
-      VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
-      out_var_handle->place_ = gpu_list_[j];
-      out_var_handle->name_ = "out";
-      out_var_handle->version_ = 2;
-      out_var_handle->scope_idx_ = j;
+      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
    }
......
@@ -79,13 +79,8 @@ struct TestGatherOpHandle {
    // add input
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
-      vars_.emplace_back(new VarHandle());
-      VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
-      in_var_handle->place_ = gpu_list_[j];
-      in_var_handle->name_ = "input";
-      in_var_handle->version_ = 1;
-      in_var_handle->scope_idx_ = j;
-      in_var_handle->generated_op_ = nullptr;
+      auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
    }
@@ -97,12 +92,9 @@ struct TestGatherOpHandle {
    op_handle_->AddInput(in_dummy_var_handle);
    // add output
-    vars_.emplace_back(new VarHandle());
-    VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
-    out_var_handle->place_ = gpu_list_[input_scope_idx];
-    out_var_handle->name_ = "out";
-    out_var_handle->version_ = 2;
-    out_var_handle->scope_idx_ = input_scope_idx;
+    auto* out_var_handle =
+        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+    vars_.emplace_back(out_var_handle);
    op_handle_->AddOutput(out_var_handle);
    // add dummy var
......
@@ -177,13 +177,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
        auto &prev_grad = vars[vars.size() - 1];
        op_handle->AddInput(prev_grad.get());
-        vars.emplace_back(new VarHandle);
-        auto &var = vars.back();
-        var->place_ = p;
-        var->name_ = og;
-        var->version_ = vars.size() - 1;
-        op_handle->AddOutput(var.get());
+        auto var = new VarHandle(vars.size() - 1, i, og, p);
+        vars.emplace_back(var);
+        op_handle->AddOutput(var);
      }
#else
      PADDLE_ENFORCE("Not implemented");
......
...@@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() { ...@@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() {
for (size_t i = 0; i < local_scopes_.size(); ++i) { for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i]; auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>(); auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor); lod_tensors.emplace_back(lod_tensor);
} }
...@@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() { ...@@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
} }
}); });
} else { // Special handle CPU only Operator's gradient. Like CRF } else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = auto &trg = *this->local_scopes_[0]
*this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>(); ->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->Var()
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU // Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg); ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func); VisitDataType(ToDataType(lod_tensors[0].type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) { for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope = local_scopes_[i]; auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i]; auto &p = places_[i];
auto *var = scope->FindVar(var_name); auto *var = scope.FindVar(var_name);
auto *dev_ctx = dev_ctxes_[p]; auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
......
...@@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} ...@@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() { void ScaleLossGradOpHandle::RunImpl() {
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_; std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
float *tmp = float *tmp = local_scope.FindVar(var_name)
scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>( ->GetMutable<LoDTensor>()
make_ddim({1}), place_); ->mutable_data<float>(make_ddim({1}), place_);
if (platform::is_cpu_place(place_)) { if (platform::is_cpu_place(place_)) {
*tmp = coeff_; *tmp = coeff_;
......
@@ -54,13 +54,8 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
  auto &var_holder = var_holders[each_var_name];
  VarHandle *var = nullptr;
  if (var_holder.empty()) {
-    var_holder.emplace_back(new VarHandle);
-    auto &init_var = var_holder[0];
-    init_var->place_ = place;
-    init_var->name_ = each_var_name;
-    init_var->generated_op_ = nullptr;
-    init_var->version_ = 0;
-    var = init_var.get();
+    var = new VarHandle(0, place_offset, each_var_name, place);
+    var_holder.emplace_back(var);
  } else {
    var = var_holder.rbegin()->get();
  }
@@ -73,12 +68,9 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
                                     size_t place_offset) {
  auto &vars = graph->vars_[place_offset][each_var_name];
  size_t version = vars.size();
-  vars.emplace_back(new VarHandle());
-  auto &var = vars.back();
-  var->version_ = version;
-  var->name_ = each_var_name;
-  var->place_ = place;
-  op_handle->AddOutput(var.get());
+  auto var = new VarHandle(version, place_offset, each_var_name, place);
+  vars.emplace_back(var);
+  op_handle->AddOutput(var);
}
template <typename Callback>
......
...@@ -33,13 +33,6 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( ...@@ -33,13 +33,6 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
running_ops_(0), running_ops_(0),
allow_op_delay_(allow_op_delay) {} allow_op_delay_(allow_op_delay) {}
void ThreadedSSAGraphExecutor::RunDelayedOps(
const std::unordered_set<OpHandleBase *> &delayed_ops) {
for (auto op : delayed_ops) {
op->Run(use_event_);
}
}
FeedFetchList ThreadedSSAGraphExecutor::Run( FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) { const std::vector<std::string> &fetch_tensors) {
std::unordered_map<OpHandleBase *, size_t> pending_ops; std::unordered_map<OpHandleBase *, size_t> pending_ops;
...@@ -51,8 +44,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -51,8 +44,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// together since we currently cannot overlap computation and memcpy streams. // together since we currently cannot overlap computation and memcpy streams.
// Should revisit it if overlapping is available. // Should revisit it if overlapping is available.
std::unordered_set<OpHandleBase *> delayed_ops; std::unordered_set<OpHandleBase *> delayed_ops;
std::unordered_set<OpHandleBase *> blocked_by_delayed_ops;
std::unordered_set<VarHandleBase *> delayed_vars;
auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) { auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
pending_vars.insert(&var); pending_vars.insert(&var);
...@@ -122,24 +113,26 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -122,24 +113,26 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
InsertPendingOp(*op); InsertPendingOp(*op);
} }
auto run_all_ready_ops = [&] { auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : ready_ops) { for (auto *op : set) {
if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
delayed_ops.insert(op);
delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
ready_vars.Extend(op->outputs_);
continue;
}
running_ops_++; running_ops_++;
RunOp(&ready_vars, op); RunOp(&ready_vars, op);
} }
ready_ops.clear(); set.clear();
}; };
// Step 3. Execution // Step 3. Execution
while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) { while (!pending_vars.empty()) {
// 1. Run All Ready ops // 1. Run All Ready ops
run_all_ready_ops(); // Keep loop until all vars are ready.
//
// NOTE: DelayedOps have a lower priority. It will be scheduled after all
// ready_ops have been performed.
if (ready_ops.empty() && allow_op_delay_) {
run_all_ops(delayed_ops);
} else {
run_all_ops(ready_ops);
}
// 2. Find ready variable // 2. Find ready variable
bool timeout; bool timeout;
...@@ -160,29 +153,16 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -160,29 +153,16 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto &deps = pending_ops[op]; auto &deps = pending_ops[op];
--deps; --deps;
if (deps == 0) { if (deps == 0) {
if (delayed_vars.find(ready_var) != delayed_vars.end()) { if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
blocked_by_delayed_ops.insert(op); delayed_ops.insert(op);
} else { } else {
ready_ops.insert(op); ready_ops.insert(op);
} }
} }
} }
} }
// When there are no other ops to schedule, schedule buffered delayed
// ops and unblock other ops.
if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
RunDelayedOps(delayed_ops);
delayed_ops.clear();
for (auto *op : blocked_by_delayed_ops) {
ready_ops.insert(op);
}
blocked_by_delayed_ops.clear();
}
// Keep loop until all vars are ready.
} }
PADDLE_ENFORCE(ready_ops.empty()); PADDLE_ENFORCE(ready_ops.empty());
PADDLE_ENFORCE(delayed_ops.empty());
PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
// Wait FetchOps. // Wait FetchOps.
if (!fetch_ops.empty()) { if (!fetch_ops.empty()) {
......
@@ -88,8 +88,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
             details::OpHandleBase *op);
-  void RunDelayedOps(const std::unordered_set<OpHandleBase *> &delayed_ops);
 private:
  std::unique_ptr<::ThreadPool> pool_;
  std::vector<Scope *> local_scopes_;
......
@@ -16,6 +16,7 @@
#include <sstream>
#include <string>
#include <unordered_set>
+#include <utility>
#include "paddle/fluid/platform/place.h"
@@ -33,10 +34,10 @@ struct VarHandleBase {
  // The operator who generate this variable. nullptr if the variable
  // is a root node.
-  OpHandleBase *generated_op_;
+  OpHandleBase* generated_op_{nullptr};
  // Operators which depend on this variable ready.
-  std::unordered_set<OpHandleBase *> pending_ops_;
+  std::unordered_set<OpHandleBase*> pending_ops_;
};
// VarHandle is actually a single version of Runtime Variable.
@@ -47,6 +48,13 @@ struct VarHandleBase {
struct VarHandle : public VarHandleBase {
  std::string DebugString() const override;
+  VarHandle(size_t version, size_t scope_index, std::string name,
+            platform::Place place)
+      : version_(version),
+        scope_idx_(scope_index),
+        name_(std::move(name)),
+        place_(std::move(place)) {}
  // version field currently is not used, however, just store the version to
  // debug easily.
  size_t version_;
@@ -54,7 +62,7 @@ struct VarHandle : public VarHandleBase {
  std::string name_;
  platform::Place place_;
-  bool operator==(const VarHandle &o) const {
+  bool operator==(const VarHandle& o) const {
    return o.generated_op_ == generated_op_ && o.name_ == name_ &&
           o.scope_idx_ == scope_idx_;
  }
......
...@@ -63,13 +63,14 @@ ParallelExecutor::ParallelExecutor( ...@@ -63,13 +63,14 @@ ParallelExecutor::ParallelExecutor(
// Step 1. Bcast the params to devs. // Step 1. Bcast the params to devs.
// Create local scopes // Create local scopes
if (local_scopes.empty()) { if (local_scopes.empty()) {
for (size_t i = 0; i < member_->places_.size(); ++i) { member_->local_scopes_.emplace_back(member_->global_scope_);
member_->local_scopes_.push_back(&scope->NewScope()); for (size_t i = 1; i < member_->places_.size(); ++i) {
member_->local_scopes_.emplace_back(&scope->NewScope());
} }
} else { } else {
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]); member_->local_scopes_.emplace_back(local_scopes[i]);
} }
} }
...@@ -155,15 +156,13 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -155,15 +156,13 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif #endif
} }
void ParallelExecutor::Run( void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
const std::vector<std::string> &fetch_tensors, const std::string &fetched_var_name) {
const std::string &fetched_var_name,
const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
platform::RecordBlock b(0); platform::RecordBlock b(0);
SplitTensorToPlaces(feed_tensors);
// Create local scopes. // Create local scopes.
for (auto &scope : member_->local_scopes_) { for (auto it = member_->local_scopes_.rbegin();
it != member_->local_scopes_.rend(); ++it) {
auto &scope = *it;
Scope &local_scope = scope->NewScope(); Scope &local_scope = scope->NewScope();
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() = *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
&local_scope; &local_scope;
...@@ -177,7 +176,7 @@ void ParallelExecutor::Run( ...@@ -177,7 +176,7 @@ void ParallelExecutor::Run(
InitializeVariable(scope->Var(std::get<0>(name_type_pair)), InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair)); std::get<1>(name_type_pair));
} else { } else {
InitializeVariable(scope->Var(std::get<0>(name_type_pair)), InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair)); std::get<1>(name_type_pair));
} }
} }
...@@ -195,14 +194,28 @@ void ParallelExecutor::Run( ...@@ -195,14 +194,28 @@ void ParallelExecutor::Run(
auto &local_scope = auto &local_scope =
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>(); *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
local_scope = nullptr;
} }
} }
void ParallelExecutor::SplitTensorToPlaces( void ParallelExecutor::FeedTensorsIntoLocalScopes(
const std::unordered_map<std::string, LoDTensor> &feed_tensors) { const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
for (auto it : feed_tensors) { PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());
auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
for (size_t i = 0; i < tensors.size(); ++i) {
auto &map = tensors[i];
auto *scope = member_->local_scopes_[i];
for (auto &pair : map) {
auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
trg->ShareDataWith(pair.second);
trg->set_lod(pair.second.lod());
}
}
}
void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
const std::unordered_map<std::string, LoDTensor> &tensors) {
for (auto pair : tensors) {
auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
member_->places_.size(), lod_tensors.size(), member_->places_.size(), lod_tensors.size(),
"The number of samples of current batch is less than the count of " "The number of samples of current batch is less than the count of "
...@@ -211,7 +224,7 @@ void ParallelExecutor::SplitTensorToPlaces( ...@@ -211,7 +224,7 @@ void ParallelExecutor::SplitTensorToPlaces(
for (size_t j = 0; j < member_->places_.size(); ++j) { for (size_t j = 0; j < member_->places_.size(); ++j) {
// TODO(panxy0718): Do I need to delete this var? // TODO(panxy0718): Do I need to delete this var?
auto t = auto t =
member_->local_scopes_[j]->Var(it.first)->GetMutable<LoDTensor>(); member_->local_scopes_[j]->Var(pair.first)->GetMutable<LoDTensor>();
t->ShareDataWith(lod_tensors[j]); t->ShareDataWith(lod_tensors[j]);
t->set_lod(lod_tensors[j].lod()); t->set_lod(lod_tensors[j].lod());
} }
......
@@ -44,16 +44,22 @@ class ParallelExecutor {
  std::vector<Scope*>& GetLocalScopes();
+  /**
+   * Feed tensors to local scopes. The size of tensors should be equal to the
+   * size of local scopes.
+   */
+  void FeedTensorsIntoLocalScopes(
+      const std::vector<std::unordered_map<std::string, LoDTensor>>& tensors);
+  void FeedAndSplitTensorIntoLocalScopes(
+      const std::unordered_map<std::string, LoDTensor>& tensors);
  void Run(const std::vector<std::string>& fetch_tensors,
-           const std::string& fetched_var_name,
-           const std::unordered_map<std::string, LoDTensor>& feed_tensors);
+           const std::string& fetched_var_name);
  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 private:
-  void SplitTensorToPlaces(
-      const std::unordered_map<std::string, LoDTensor>& feed_tensors);
  ParallelExecutorPrivate* member_;
};
......
@@ -21,4 +21,7 @@ endif()
if(WITH_TESTING)
  add_subdirectory(tests/book)
+  if (TENSORRT_FOUND)
+    add_subdirectory(tensorrt)
+  endif()
endif()
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "NvInfer.h"
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "paddle/fluid/platform/dynload/tensorrt.h"
namespace dy = paddle::platform::dynload;
class Logger : public nvinfer1::ILogger {
public:
void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
switch (severity) {
case Severity::kINFO:
LOG(INFO) << msg;
break;
case Severity::kWARNING:
LOG(WARNING) << msg;
break;
case Severity::kINTERNAL_ERROR:
case Severity::kERROR:
LOG(ERROR) << msg;
break;
default:
break;
}
}
};
class ScopedWeights {
public:
ScopedWeights(float value) : value_(value) {
w.type = nvinfer1::DataType::kFLOAT;
w.values = &value_;
w.count = 1;
}
const nvinfer1::Weights& get() { return w; }
private:
float value_;
nvinfer1::Weights w;
};
// The following two APIs are implemented in TensorRT's header file and cannot
// be loaded from the dynamic library, so we create our own implementations and
// directly trigger the methods from the dynamic library.
nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
return static_cast<nvinfer1::IBuilder*>(
dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
}
nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
return static_cast<nvinfer1::IRuntime*>(
dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
}
const char* kInputTensor = "input";
const char* kOutputTensor = "output";
// Creates a network to compute y = 2x + 3
nvinfer1::IHostMemory* CreateNetwork() {
Logger logger;
// Create the engine.
nvinfer1::IBuilder* builder = createInferBuilder(logger);
ScopedWeights weights(2.);
ScopedWeights bias(3.);
nvinfer1::INetworkDefinition* network = builder->createNetwork();
// Add the input
auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 1, 1});
EXPECT_NE(input, nullptr);
// Add the hidden layer.
auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
EXPECT_NE(layer, nullptr);
// Mark the output.
auto output = layer->getOutput(0);
output->setName(kOutputTensor);
network->markOutput(*output);
// Build the engine.
builder->setMaxBatchSize(1);
builder->setMaxWorkspaceSize(1 << 10);
auto engine = builder->buildCudaEngine(*network);
EXPECT_NE(engine, nullptr);
// Serialize the engine to create a model, then close.
nvinfer1::IHostMemory* model = engine->serialize();
network->destroy();
engine->destroy();
builder->destroy();
return model;
}
void Execute(nvinfer1::IExecutionContext& context, const float* input,
float* output) {
const nvinfer1::ICudaEngine& engine = context.getEngine();
// Two binds, input and output
ASSERT_EQ(engine.getNbBindings(), 2);
const int input_index = engine.getBindingIndex(kInputTensor);
const int output_index = engine.getBindingIndex(kOutputTensor);
// Create GPU buffers and a stream
void* buffers[2];
ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
cudaStream_t stream;
ASSERT_EQ(0, cudaStreamCreate(&stream));
// Copy the input to the GPU, execute the network, and copy the output back.
ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
cudaMemcpyHostToDevice, stream));
context.enqueue(1, buffers, stream, nullptr);
ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// Release the stream and the buffers
cudaStreamDestroy(stream);
ASSERT_EQ(0, cudaFree(buffers[input_index]));
ASSERT_EQ(0, cudaFree(buffers[output_index]));
}
TEST(TensorrtTest, BasicFunction) {
// Create the network serialized model.
nvinfer1::IHostMemory* model = CreateNetwork();
// Use the model to create an engine and an execution context.
Logger logger;
nvinfer1::IRuntime* runtime = createInferRuntime(logger);
nvinfer1::ICudaEngine* engine =
runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
model->destroy();
nvinfer1::IExecutionContext* context = engine->createExecutionContext();
// Execute the network.
float input = 1234;
float output;
Execute(*context, &input, &output);
EXPECT_EQ(output, input * 2 + 3);
// Destroy the engine.
context->destroy();
engine->destroy();
runtime->destroy();
}
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op.h"
+#include <string>
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -87,7 +88,7 @@ struct BeamSearchDecoder { ...@@ -87,7 +88,7 @@ struct BeamSearchDecoder {
*/ */
std::vector<BeamNodeVector<T>> PackTwoSteps( std::vector<BeamNodeVector<T>> PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores, const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>& prefixes_list, std::vector<BeamNodeVector<T>>* prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const; std::vector<SentenceVector<T>>* sentence_vector_list) const;
/** /**
...@@ -140,7 +141,7 @@ Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const { ...@@ -140,7 +141,7 @@ Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
template <typename T> template <typename T>
std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps( std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores, const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>& prefixes_list, std::vector<BeamNodeVector<T>>* prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const { std::vector<SentenceVector<T>>* sentence_vector_list) const {
std::vector<BeamNodeVector<T>> result; std::vector<BeamNodeVector<T>> result;
...@@ -153,7 +154,7 @@ std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps( ...@@ -153,7 +154,7 @@ std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
// if prefixes size is 0, it means this is the first step. In this step, // if prefixes size is 0, it means this is the first step. In this step,
// all candidate id is the start of candidate sentences. // all candidate id is the start of candidate sentences.
if (prefixes_list.empty()) { if (prefixes_list->empty()) {
PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
cur_ids.lod().at(kSentenceLevel).back(), cur_ids.lod().at(kSentenceLevel).back(),
"in the first step"); "in the first step");
...@@ -162,7 +163,7 @@ std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps( ...@@ -162,7 +163,7 @@ std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx]))); cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
} }
} else { } else {
BeamNodeVector<T>& prefixes = prefixes_list[src_idx]; BeamNodeVector<T>& prefixes = prefixes_list->at(src_idx);
SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx]; SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
...@@ -262,7 +263,7 @@ void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids, ...@@ -262,7 +263,7 @@ void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
for (size_t step_id = 0; step_id < step_num; ++step_id) { for (size_t step_id = 0; step_id < step_num; ++step_id) {
beamnode_vector_list = beamnode_vector_list =
PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
beamnode_vector_list, &sentence_vector_list); &beamnode_vector_list, &sentence_vector_list);
} }
// append last beam_node to result // append last beam_node to result
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
......
@@ -125,7 +125,7 @@ TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
  BeamSearchDecoder<float> helper;
  beamnode_vector_list = helper.PackTwoSteps(
-      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+      ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
  ASSERT_EQ(beamnode_vector_list.size(), 2UL);
  ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
  ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
@@ -167,7 +167,7 @@ TEST(BeamSearchDecodeOp, PackTwoSteps) {
  BeamSearchDecoder<float> helper1;
  beamnode_vector_list = helper1.PackTwoSteps(
-      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+      ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
  ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
  ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
......
@@ -14,7 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
+#include <algorithm>
#include <map>
+#include <string>
+#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
......
@@ -18,6 +18,8 @@ limitations under the License. */
#include "gtest/gtest.h"
#endif
+#include <string>
+#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
......
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/chunk_eval_op.h"
+#include <string>
+#include <vector>
namespace paddle {
namespace operators {
......
...@@ -14,6 +14,9 @@ limitations under the License. */ ...@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <set> #include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -36,11 +39,11 @@ class ChunkEvalKernel : public framework::OpKernel<T> { ...@@ -36,11 +39,11 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
}; };
void GetSegments(const int64_t* label, int length, void GetSegments(const int64_t* label, int length,
std::vector<Segment>& segments, int num_chunk_types, std::vector<Segment>* segments, int num_chunk_types,
int num_tag_types, int other_chunk_type, int tag_begin, int num_tag_types, int other_chunk_type, int tag_begin,
int tag_inside, int tag_end, int tag_single) const { int tag_inside, int tag_end, int tag_single) const {
segments.clear(); segments->clear();
segments.reserve(length); segments->reserve(length);
int chunk_start = 0; int chunk_start = 0;
bool in_chunk = false; bool in_chunk = false;
int tag = -1; int tag = -1;
...@@ -58,7 +61,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> { ...@@ -58,7 +61,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
i - 1, // end i - 1, // end
prev_type, prev_type,
}; };
segments.push_back(segment); segments->push_back(segment);
in_chunk = false; in_chunk = false;
} }
if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type, if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
...@@ -73,7 +76,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> { ...@@ -73,7 +76,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
length - 1, // end length - 1, // end
type, type,
}; };
segments.push_back(segment); segments->push_back(segment);
} }
} }
...@@ -177,8 +180,8 @@ class ChunkEvalKernel : public framework::OpKernel<T> { ...@@ -177,8 +180,8 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
for (int i = 0; i < num_sequences; ++i) { for (int i = 0; i < num_sequences; ++i) {
int seq_length = lod[0][i + 1] - lod[0][i]; int seq_length = lod[0][i + 1] - lod[0][i];
EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length, EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
output_segments, label_segments, *num_infer_chunks_data, &output_segments, &label_segments, num_infer_chunks_data,
*num_label_chunks_data, *num_correct_chunks_data, num_label_chunks_data, num_correct_chunks_data,
num_chunk_types, num_tag_types, other_chunk_type, tag_begin, num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
tag_inside, tag_end, tag_single, excluded_chunk_types); tag_inside, tag_end, tag_single, excluded_chunk_types);
} }
...@@ -197,10 +200,10 @@ class ChunkEvalKernel : public framework::OpKernel<T> { ...@@ -197,10 +200,10 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
} }
void EvalOneSeq(const int64_t* output, const int64_t* label, int length, void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
std::vector<Segment>& output_segments, std::vector<Segment>* output_segments,
std::vector<Segment>& label_segments, std::vector<Segment>* label_segments,
int64_t& num_output_segments, int64_t& num_label_segments, int64_t* num_output_segments, int64_t* num_label_segments,
int64_t& num_correct, int num_chunk_types, int num_tag_types, int64_t* num_correct, int num_chunk_types, int num_tag_types,
int other_chunk_type, int tag_begin, int tag_inside, int other_chunk_type, int tag_begin, int tag_inside,
int tag_end, int tag_single, int tag_end, int tag_single,
const std::set<int>& excluded_chunk_types) const { const std::set<int>& excluded_chunk_types) const {
...@@ -209,25 +212,29 @@ class ChunkEvalKernel : public framework::OpKernel<T> { ...@@ -209,25 +212,29 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
GetSegments(label, length, label_segments, num_chunk_types, num_tag_types, GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
size_t i = 0, j = 0; size_t i = 0, j = 0;
while (i < output_segments.size() && j < label_segments.size()) { while (i < output_segments->size() && j < label_segments->size()) {
if (output_segments[i] == label_segments[j] && if (output_segments->at(i) == label_segments->at(j) &&
excluded_chunk_types.count(output_segments[i].type) != 1) { excluded_chunk_types.count(output_segments->at(i).type) != 1) {
++num_correct; ++(*num_correct);
} }
if (output_segments[i].end < label_segments[j].end) { if (output_segments->at(i).end < label_segments->at(j).end) {
++i; ++i;
} else if (output_segments[i].end > label_segments[j].end) { } else if (output_segments->at(i).end > label_segments->at(j).end) {
++j; ++j;
} else { } else {
++i; ++i;
++j; ++j;
} }
} }
for (auto& segment : label_segments) { for (auto& segment : (*label_segments)) {
if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments; if (excluded_chunk_types.count(segment.type) != 1) {
++(*num_label_segments);
}
} }
for (auto& segment : output_segments) { for (auto& segment : (*output_segments)) {
if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments; if (excluded_chunk_types.count(segment.type) != 1) {
++(*num_output_segments);
}
} }
} }
}; };
......
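The chunk_eval changes above are a mechanical switch from mutable-reference to pointer output parameters; the matching logic itself is unchanged. For readers skimming the diff, a rough Python sketch of the two-pointer chunk matching that EvalOneSeq performs (the segment tuples and function name are illustrative, not the operator's API):

```python
# Illustrative sketch of EvalOneSeq's chunk matching; both lists are ordered,
# so a single forward pass with two indices counts the exact matches.
def count_correct_chunks(pred_segments, label_segments, excluded_types=frozenset()):
    """Segments are (begin, end, type) tuples, sorted by end position."""
    correct = 0
    i = j = 0
    while i < len(pred_segments) and j < len(label_segments):
        if (pred_segments[i] == label_segments[j] and
                pred_segments[i][2] not in excluded_types):
            correct += 1
        # advance whichever segment ends first; on a tie advance both
        if pred_segments[i][1] < label_segments[j][1]:
            i += 1
        elif pred_segments[i][1] > label_segments[j][1]:
            j += 1
        else:
            i += 1
            j += 1
    return correct

print(count_correct_chunks([(0, 1, 0), (3, 4, 1)], [(0, 1, 0), (2, 4, 1)]))  # 1
```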
...@@ -73,9 +73,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -73,9 +73,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto src_memory = auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); mkldnn::memory({src_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(input_data)));
auto weights_memory = auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data); mkldnn::memory({weights_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(filter_data)));
auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd = std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
...@@ -180,8 +182,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -180,8 +182,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
// create memory // create memory
auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine}, auto diff_dst_memory = mkldnn::memory(
(void*)output_grad_data); {diff_weights_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(output_grad_data)));
// Retrieve conv_pd from device context // Retrieve conv_pd from device context
auto conv_pd = auto conv_pd =
std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>( std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
...@@ -198,10 +201,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -198,10 +201,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
mkldnn_engine); mkldnn_engine);
// create memory // create memory
auto diff_weights_memory = mkldnn::memory( auto diff_weights_memory =
{diff_weights_md, mkldnn_engine}, (void*)filter_grad_data); mkldnn::memory({diff_weights_md, mkldnn_engine},
reinterpret_cast<void*>(filter_grad_data));
auto src_memory = auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); mkldnn::memory({src_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(input_data)));
// create backward conv primitive for weights // create backward conv primitive for weights
auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights( auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
...@@ -220,10 +225,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -220,10 +225,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
strides, paddings, *conv_pd, mkldnn_engine); strides, paddings, *conv_pd, mkldnn_engine);
// create memory // create memory
auto diff_src_memory = auto diff_src_memory = mkldnn::memory(
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data); {diff_src_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(input_grad_data)));
auto weights_memory = auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data); mkldnn::memory({weights_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(filter_data)));
// create backward conv primitive for data // create backward conv primitive for data
auto conv_bwd_data_prim = mkldnn::convolution_backward_data( auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/depthwise_conv.h"
...@@ -41,9 +42,10 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation, ...@@ -41,9 +42,10 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation,
return output_size; return output_size;
} }
inline bool IsExpand(std::vector<int64_t>& filter_dim, inline bool IsExpand(const std::vector<int64_t>& filter_dim,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
std::vector<int>& dilations) { const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) { for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1); filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detection_map_op.h" #include "paddle/fluid/operators/detection_map_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -82,7 +87,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -82,7 +87,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::vector<std::map<int, std::vector<Box>>> gt_boxes; std::vector<std::map<int, std::vector<Box>>> gt_boxes;
std::vector<std::map<int, std::vector<std::pair<T, Box>>>> detect_boxes; std::vector<std::map<int, std::vector<std::pair<T, Box>>>> detect_boxes;
GetBoxes(*in_label, *in_detect, gt_boxes, detect_boxes); GetBoxes(*in_label, *in_detect, &gt_boxes, detect_boxes);
std::map<int, int> label_pos_count; std::map<int, int> label_pos_count;
std::map<int, std::vector<std::pair<T, int>>> true_pos; std::map<int, std::vector<std::pair<T, int>>> true_pos;
...@@ -95,20 +100,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -95,20 +100,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
} }
if (in_pos_count != nullptr && state) { if (in_pos_count != nullptr && state) {
GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count, GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, &label_pos_count,
true_pos, false_pos, class_num); &true_pos, &false_pos, class_num);
} }
CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult, CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult,
overlap_threshold, label_pos_count, true_pos, overlap_threshold, &label_pos_count, &true_pos,
false_pos); &false_pos);
int background_label = ctx.Attr<int>("background_label"); int background_label = ctx.Attr<int>("background_label");
T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos, T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos,
background_label); background_label);
GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count, GetOutputPos(ctx, label_pos_count, true_pos, false_pos, out_pos_count,
*out_true_pos, *out_false_pos, class_num); out_true_pos, out_false_pos, class_num);
T* map_data = out_map->mutable_data<T>(ctx.GetPlace()); T* map_data = out_map->mutable_data<T>(ctx.GetPlace());
map_data[0] = map; map_data[0] = map;
...@@ -155,7 +160,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -155,7 +160,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
void GetBoxes(const framework::LoDTensor& input_label, void GetBoxes(const framework::LoDTensor& input_label,
const framework::LoDTensor& input_detect, const framework::LoDTensor& input_detect,
std::vector<std::map<int, std::vector<Box>>>& gt_boxes, std::vector<std::map<int, std::vector<Box>>>* gt_boxes,
std::vector<std::map<int, std::vector<std::pair<T, Box>>>>& std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
detect_boxes) const { detect_boxes) const {
auto labels = framework::EigenTensor<T, 2>::From(input_label); auto labels = framework::EigenTensor<T, 2>::From(input_label);
...@@ -179,7 +184,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -179,7 +184,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
box.is_difficult = true; box.is_difficult = true;
boxes[label].push_back(box); boxes[label].push_back(box);
} }
gt_boxes.push_back(boxes); gt_boxes->push_back(boxes);
} }
auto detect_index = detect_lod[0]; auto detect_index = detect_lod[0];
...@@ -200,9 +205,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -200,9 +205,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
const std::map<int, int>& label_pos_count, const std::map<int, int>& label_pos_count,
const std::map<int, std::vector<std::pair<T, int>>>& true_pos, const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
const std::map<int, std::vector<std::pair<T, int>>>& false_pos, const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
framework::Tensor& output_pos_count, framework::Tensor* output_pos_count,
framework::LoDTensor& output_true_pos, framework::LoDTensor* output_true_pos,
framework::LoDTensor& output_false_pos, const int class_num) const { framework::LoDTensor* output_false_pos, const int class_num) const {
int true_pos_count = 0; int true_pos_count = 0;
int false_pos_count = 0; int false_pos_count = 0;
for (auto it = true_pos.begin(); it != true_pos.end(); ++it) { for (auto it = true_pos.begin(); it != true_pos.end(); ++it) {
...@@ -214,12 +219,12 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -214,12 +219,12 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
false_pos_count += fp.size(); false_pos_count += fp.size();
} }
int* pos_count_data = output_pos_count.mutable_data<int>( int* pos_count_data = output_pos_count->mutable_data<int>(
framework::make_ddim({class_num, 1}), ctx.GetPlace()); framework::make_ddim({class_num, 1}), ctx.GetPlace());
T* true_pos_data = output_true_pos.mutable_data<T>( T* true_pos_data = output_true_pos->mutable_data<T>(
framework::make_ddim({true_pos_count, 2}), ctx.GetPlace()); framework::make_ddim({true_pos_count, 2}), ctx.GetPlace());
T* false_pos_data = output_false_pos.mutable_data<T>( T* false_pos_data = output_false_pos->mutable_data<T>(
framework::make_ddim({false_pos_count, 2}), ctx.GetPlace()); framework::make_ddim({false_pos_count, 2}), ctx.GetPlace());
true_pos_count = 0; true_pos_count = 0;
false_pos_count = 0; false_pos_count = 0;
...@@ -261,21 +266,21 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -261,21 +266,21 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
framework::LoD false_pos_lod; framework::LoD false_pos_lod;
false_pos_lod.emplace_back(false_pos_starts); false_pos_lod.emplace_back(false_pos_starts);
output_true_pos.set_lod(true_pos_lod); output_true_pos->set_lod(true_pos_lod);
output_false_pos.set_lod(false_pos_lod); output_false_pos->set_lod(false_pos_lod);
return; return;
} }
void GetInputPos(const framework::Tensor& input_pos_count, void GetInputPos(const framework::Tensor& input_pos_count,
const framework::LoDTensor& input_true_pos, const framework::LoDTensor& input_true_pos,
const framework::LoDTensor& input_false_pos, const framework::LoDTensor& input_false_pos,
std::map<int, int>& label_pos_count, std::map<int, int>* label_pos_count,
std::map<int, std::vector<std::pair<T, int>>>& true_pos, std::map<int, std::vector<std::pair<T, int>>>* true_pos,
std::map<int, std::vector<std::pair<T, int>>>& false_pos, std::map<int, std::vector<std::pair<T, int>>>* false_pos,
const int class_num) const { const int class_num) const {
const int* pos_count_data = input_pos_count.data<int>(); const int* pos_count_data = input_pos_count.data<int>();
for (int i = 0; i < class_num; ++i) { for (int i = 0; i < class_num; ++i) {
label_pos_count[i] = pos_count_data[i]; (*label_pos_count)[i] = pos_count_data[i];
} }
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
...@@ -291,8 +296,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -291,8 +296,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
} }
}; };
SetData(input_true_pos, true_pos); SetData(input_true_pos, *true_pos);
SetData(input_false_pos, false_pos); SetData(input_false_pos, *false_pos);
return; return;
} }
...@@ -301,9 +306,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -301,9 +306,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
const std::vector<std::map<int, std::vector<std::pair<T, Box>>>>& const std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
detect_boxes, detect_boxes,
bool evaluate_difficult, float overlap_threshold, bool evaluate_difficult, float overlap_threshold,
std::map<int, int>& label_pos_count, std::map<int, int>* label_pos_count,
std::map<int, std::vector<std::pair<T, int>>>& true_pos, std::map<int, std::vector<std::pair<T, int>>>* true_pos,
std::map<int, std::vector<std::pair<T, int>>>& false_pos) const { std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size(); int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
auto image_gt_boxes = gt_boxes[n]; auto image_gt_boxes = gt_boxes[n];
...@@ -320,10 +325,10 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -320,10 +325,10 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
continue; continue;
} }
int label = it->first; int label = it->first;
if (label_pos_count.find(label) == label_pos_count.end()) { if (label_pos_count->find(label) == label_pos_count->end()) {
label_pos_count[label] = count; (*label_pos_count)[label] = count;
} else { } else {
label_pos_count[label] += count; (*label_pos_count)[label] += count;
} }
} }
} }
...@@ -338,8 +343,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -338,8 +343,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
int label = it->first; int label = it->first;
for (size_t i = 0; i < pred_boxes.size(); ++i) { for (size_t i = 0; i < pred_boxes.size(); ++i) {
auto score = pred_boxes[i].first; auto score = pred_boxes[i].first;
true_pos[label].push_back(std::make_pair(score, 0)); (*true_pos)[label].push_back(std::make_pair(score, 0));
false_pos[label].push_back(std::make_pair(score, 1)); (*false_pos)[label].push_back(std::make_pair(score, 1));
} }
} }
continue; continue;
...@@ -351,8 +356,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -351,8 +356,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
if (image_gt_boxes.find(label) == image_gt_boxes.end()) { if (image_gt_boxes.find(label) == image_gt_boxes.end()) {
for (size_t i = 0; i < pred_boxes.size(); ++i) { for (size_t i = 0; i < pred_boxes.size(); ++i) {
auto score = pred_boxes[i].first; auto score = pred_boxes[i].first;
true_pos[label].push_back(std::make_pair(score, 0)); (*true_pos)[label].push_back(std::make_pair(score, 0));
false_pos[label].push_back(std::make_pair(score, 1)); (*false_pos)[label].push_back(std::make_pair(score, 1));
} }
continue; continue;
} }
...@@ -381,17 +386,17 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -381,17 +386,17 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
(!evaluate_difficult && !matched_bboxes[max_idx].is_difficult); (!evaluate_difficult && !matched_bboxes[max_idx].is_difficult);
if (match_evaluate_difficult) { if (match_evaluate_difficult) {
if (!visited[max_idx]) { if (!visited[max_idx]) {
true_pos[label].push_back(std::make_pair(score, 1)); (*true_pos)[label].push_back(std::make_pair(score, 1));
false_pos[label].push_back(std::make_pair(score, 0)); (*false_pos)[label].push_back(std::make_pair(score, 0));
visited[max_idx] = true; visited[max_idx] = true;
} else { } else {
true_pos[label].push_back(std::make_pair(score, 0)); (*true_pos)[label].push_back(std::make_pair(score, 0));
false_pos[label].push_back(std::make_pair(score, 1)); (*false_pos)[label].push_back(std::make_pair(score, 1));
} }
} }
} else { } else {
true_pos[label].push_back(std::make_pair(score, 0)); (*true_pos)[label].push_back(std::make_pair(score, 0));
false_pos[label].push_back(std::make_pair(score, 1)); (*false_pos)[label].push_back(std::make_pair(score, 1));
} }
} }
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/edit_distance_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_helper.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <fstream>
#include <ostream> #include <ostream>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <vector> #include <vector>
...@@ -67,7 +68,7 @@ ListenAndServOp::ListenAndServOp(const std::string &type, ...@@ -67,7 +68,7 @@ ListenAndServOp::ListenAndServOp(const std::string &type,
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
int ListenAndServOp::GetSelectedPort() { int ListenAndServOp::GetSelectedPort() const {
return rpc_service_->GetSelectedPort(); return rpc_service_->GetSelectedPort();
} }
...@@ -99,7 +100,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -99,7 +100,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
std::vector<int> block_list; std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) { for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
if (blkid != prefetch_block->ID()) { if (blkid != static_cast<size_t>(prefetch_block->ID())) {
block_list.push_back(blkid); block_list.push_back(blkid);
} }
} }
...@@ -121,10 +122,14 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -121,10 +122,14 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
rpc_service_->SetProgram(program); rpc_service_->SetProgram(program);
// start the server listening after all member initialized. // start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_)); server_thread_.reset(new std::thread(RunServer, rpc_service_));
// FIXME(typhoonzero): do we need to wait until the server port is ready? VLOG(3) << "wait server thread to become ready...";
sleep(5); sleep(5);
// Write the selected server port to a file so that the Python side can read it.
std::ofstream port_file;
port_file.open("/tmp/paddle.selected_port");
port_file << rpc_service_->GetSelectedPort();
port_file.close();
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false; bool exit_flag = false;
// Record received sparse variables, so that // Record received sparse variables, so that
// we could reset those after execute optimize program // we could reset those after execute optimize program
...@@ -175,7 +180,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -175,7 +180,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
parallel_blkids.push_back(1); parallel_blkids.push_back(1);
double ts = detail::GetTimestamp(); double ts = detail::GetTimestamp();
for (size_t blkid = 2; blkid < num_blocks; ++blkid) { for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
if (blkid != prefetch_block->ID()) { if (blkid != static_cast<size_t>(prefetch_block->ID())) {
if (program->Block(blkid).Parent() != last_parent_blkid) { if (program->Block(blkid).Parent() != last_parent_blkid) {
ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
program, &recv_scope); program, &recv_scope);
......
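listen_and_serv now publishes the port it actually bound to in /tmp/paddle.selected_port instead of relying on the fixed sleep alone. A minimal sketch of how a Python launcher might wait for that file; the helper name and timeout are made up for illustration and are not part of Paddle:

```python
import os
import time

# Hypothetical helper, not part of Paddle: poll until the server has written
# its selected port to the file used by listen_and_serv above.
def wait_for_selected_port(path="/tmp/paddle.selected_port", timeout=30.0):
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(path):
            content = open(path).read().strip()
            if content:
                return int(content)
        time.sleep(0.1)
    raise RuntimeError("server did not publish its port within %.0f seconds" % timeout)

# Example use: endpoint = "127.0.0.1:%d" % wait_for_selected_port()
```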
...@@ -39,7 +39,7 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -39,7 +39,7 @@ class ListenAndServOp : public framework::OperatorBase {
const framework::VariableNameMap &outputs, const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs); const framework::AttributeMap &attrs);
int GetSelectedPort(); int GetSelectedPort() const;
void Stop() override; void Stop() override;
......
...@@ -139,7 +139,6 @@ void StartServerNet(bool is_sparse) { ...@@ -139,7 +139,6 @@ void StartServerNet(bool is_sparse) {
attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"PrefetchBlock", prefetch_block});
listen_and_serv_op = listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
LOG(INFO) << "selected port before run " << selected_port;
listen_and_serv_op->Run(scope, place); listen_and_serv_op->Run(scope, place);
LOG(INFO) << "server exit"; LOG(INFO) << "server exit";
} }
...@@ -158,16 +157,13 @@ TEST(SendRecvOp, CPUDense) { ...@@ -158,16 +157,13 @@ TEST(SendRecvOp, CPUDense) {
selected_port = static_cast<paddle::operators::ListenAndServOp *>( selected_port = static_cast<paddle::operators::ListenAndServOp *>(
listen_and_serv_op.get()) listen_and_serv_op.get())
->GetSelectedPort(); ->GetSelectedPort();
LOG(INFO) << "selected port " << selected_port;
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})}); attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
attrs.insert({"epmap", std::vector<std::string>({endpoint})}); attrs.insert({"epmap", std::vector<std::string>({endpoint})});
auto send_op = f::OpRegistry::CreateOp( auto send_op = f::OpRegistry::CreateOp(
"send", {{"X", {"x1"}}}, "send", {{"X", {"x1"}}},
{{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs); {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
LOG(INFO) << "before run " << endpoint;
send_op->Run(scope, place); send_op->Run(scope, place);
LOG(INFO) << "end run";
auto in_var = scope.Var("x1"); auto in_var = scope.Var("x1");
auto tensor = in_var->GetMutable<f::LoDTensor>(); auto tensor = in_var->GetMutable<f::LoDTensor>();
...@@ -180,7 +176,6 @@ TEST(SendRecvOp, CPUDense) { ...@@ -180,7 +176,6 @@ TEST(SendRecvOp, CPUDense) {
for (int64_t i = 0; i < target->numel(); ++i) { for (int64_t i = 0; i < target->numel(); ++i) {
EXPECT_EQ(expected[i] * 2, actual[i]); EXPECT_EQ(expected[i] * 2, actual[i]);
} }
LOG(INFO) << "before stop";
listen_and_serv_op->Stop(); listen_and_serv_op->Stop();
server_thread.join(); server_thread.join();
listen_and_serv_op.reset(nullptr); listen_and_serv_op.reset(nullptr);
...@@ -199,7 +194,6 @@ TEST(SendRecvOp, CPUSparse) { ...@@ -199,7 +194,6 @@ TEST(SendRecvOp, CPUSparse) {
selected_port = static_cast<paddle::operators::ListenAndServOp *>( selected_port = static_cast<paddle::operators::ListenAndServOp *>(
listen_and_serv_op.get()) listen_and_serv_op.get())
->GetSelectedPort(); ->GetSelectedPort();
LOG(INFO) << "selected port " << selected_port;
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})}); attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
attrs.insert({"epmap", std::vector<std::string>({endpoint})}); attrs.insert({"epmap", std::vector<std::string>({endpoint})});
......
...@@ -73,6 +73,15 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> { ...@@ -73,6 +73,15 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
softmax_dst_memory); softmax_dst_memory);
std::vector<primitive> pipeline{softmax}; std::vector<primitive> pipeline{softmax};
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
const bool is_test = ctx.Attr<bool>("is_test");
if (!is_test) {
T threshold = exp(-64);
for (size_t i = 0; i < dst_tz[0] * dst_tz[1]; ++i) {
output_data[i] =
output_data[i] < threshold ? threshold : output_data[i];
}
}
} }
}; };
......
...@@ -97,6 +97,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -97,6 +97,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("is_test",
"Disable epsilon adding to softmax results. Used by MKLDNN.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Softmax Operator. Softmax Operator.
......
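The new MKLDNN softmax code floors outputs at exp(-64) when is_test is false, so a downstream log (for example in cross-entropy) never receives an exact zero. A small numpy illustration of that floor, with arbitrary logits:

```python
import numpy as np

logits = np.array([[-200.0, 0.0, 5.0]], dtype=np.float32)
probs = np.exp(logits - logits.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)

threshold = np.float32(np.exp(-64))          # same floor as the MKLDNN kernel
clamped = np.maximum(probs, threshold)

print(np.log(probs[0, 0]))    # -inf: the smallest probability underflowed to 0
print(np.log(clamped[0, 0]))  # about -64: finite after clamping
```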
...@@ -24,7 +24,6 @@ namespace paddle { ...@@ -24,7 +24,6 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
...@@ -36,9 +35,9 @@ class TopkKernel : public framework::OpKernel<T> { ...@@ -36,9 +35,9 @@ class TopkKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
// Get the top k elements of each row of input tensor // Get the top k elements of each row of input tensor
// FIXME: only deal with matrix(2d tensor). // FIXME: only deal with matrix(2d tensor).
auto* input = ctx.Input<LoDTensor>("X"); auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<LoDTensor>("Out"); auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<LoDTensor>("Indices"); auto* indices = ctx.Output<Tensor>("Indices");
// k is determined by Attr // k is determined by Attr
const size_t k = static_cast<int>(ctx.Attr<int>("k")); const size_t k = static_cast<int>(ctx.Attr<int>("k"));
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
if (TENSORRT_FOUND)
list(APPEND CUDA_SRCS tensorrt.cc)
endif()
configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND) if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc) list(APPEND CUDA_SRCS cupti.cc)
......
...@@ -45,6 +45,10 @@ DEFINE_string(nccl_dir, "", ...@@ -45,6 +45,10 @@ DEFINE_string(nccl_dir, "",
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string(
tensorrt_dir, "",
"Specify path for loading tensorrt library, such as libnvinfer.so.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
...@@ -194,6 +198,14 @@ void* GetNCCLDsoHandle() { ...@@ -194,6 +198,14 @@ void* GetNCCLDsoHandle() {
#endif #endif
} }
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
#endif
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -25,6 +25,7 @@ void* GetCurandDsoHandle(); ...@@ -25,6 +25,7 @@ void* GetCurandDsoHandle();
void* GetWarpCTCDsoHandle(); void* GetWarpCTCDsoHandle();
void* GetLapackDsoHandle(); void* GetLapackDsoHandle();
void* GetNCCLDsoHandle(); void* GetNCCLDsoHandle();
void* GetTensorRtDsoHandle();
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/tensorrt.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag tensorrt_dso_flag;
void *tensorrt_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <NvInfer.h>
#include <dlfcn.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag tensorrt_dso_flag;
extern void* tensorrt_dso_handle;
#ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using tensorrt_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = \
paddle::platform::dynload::GetTensorRtDsoHandle(); \
PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
}); \
void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
PADDLE_ENFORCE(p_##__name, "load %s failed", #__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#else
#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
tensorrtResult_t operator()(Args... args) { \
return __name(args...); \
} \
}; \
extern DynLoad__##__name __name
#endif
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL);
TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
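The DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP macro above opens libnvinfer.so once under std::call_once and resolves each entry point lazily with dlsym. A rough Python analogue of the same lazy-loading pattern, written with ctypes; libm and cos are stand-ins so the sketch can run without TensorRT installed:

```python
import ctypes
import ctypes.util
import threading

_dso_handle = None
_dso_lock = threading.Lock()

def _get_dso_handle():
    # Open the shared library only once, like std::call_once in the C++ macro.
    global _dso_handle
    with _dso_lock:
        if _dso_handle is None:
            # Stand-in for libnvinfer.so; libm keeps the sketch runnable anywhere.
            _dso_handle = ctypes.CDLL(ctypes.util.find_library("m"))
    return _dso_handle

def dynload(symbol_name, restype, argtypes):
    # Resolve the symbol on first use, like the dlsym call in the wrapper struct.
    func = getattr(_get_dso_handle(), symbol_name)
    func.restype = restype
    func.argtypes = argtypes
    return func

cos = dynload("cos", ctypes.c_double, [ctypes.c_double])
print(cos(0.0))  # 1.0
```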
...@@ -873,6 +873,11 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) { ...@@ -873,6 +873,11 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) {
return !((isnan)(a)) && !((isinf)(a)); return !((isnan)(a)) && !((isinf)(a));
} }
inline std::ostream& operator<<(std::ostream& os, const float16& a) {
os << static_cast<float>(a);
return os;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
......
...@@ -141,5 +141,10 @@ TEST(float16, lod_tensor_cpu) { ...@@ -141,5 +141,10 @@ TEST(float16, lod_tensor_cpu) {
} }
} }
TEST(float16, print) {
float16 a = float16(1.0f);
std::cout << a << std::endl;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -39,20 +39,19 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { ...@@ -39,20 +39,19 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
class NCCLGroupGuard { class NCCLGroupGuard {
public: public:
static std::mutex &NCCLMutex() {
static std::mutex mtx;
return mtx;
}
inline NCCLGroupGuard() { inline NCCLGroupGuard() {
mutex().lock(); NCCLMutex().lock();
PADDLE_ENFORCE(dynload::ncclGroupStart()); PADDLE_ENFORCE(dynload::ncclGroupStart());
} }
inline ~NCCLGroupGuard() { inline ~NCCLGroupGuard() {
PADDLE_ENFORCE(dynload::ncclGroupEnd()); PADDLE_ENFORCE(dynload::ncclGroupEnd());
mutex().unlock(); NCCLMutex().unlock();
}
private:
static std::mutex &mutex() {
static std::mutex mtx;
return mtx;
} }
}; };
...@@ -68,26 +67,6 @@ struct NCCLContext { ...@@ -68,26 +67,6 @@ struct NCCLContext {
int device_id() const { int device_id() const {
return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device; return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
} }
static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
const std::vector<platform::Place> &places) {
std::vector<ncclComm_t> comms;
std::vector<int> devs;
comms.resize(contexts->size());
devs.reserve(contexts->size());
for (auto &p : places) {
devs.push_back(boost::get<platform::CUDAPlace>(p).device);
}
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(contexts->size()), &devs[0]));
int i = 0;
for (auto &dev_id : devs) {
contexts->at(dev_id).comm_ = comms[i++];
}
}
}; };
struct NCCLContextMap { struct NCCLContextMap {
...@@ -107,12 +86,12 @@ struct NCCLContextMap { ...@@ -107,12 +86,12 @@ struct NCCLContextMap {
"NCCL Context Map does not support contain two or more same device"); "NCCL Context Map does not support contain two or more same device");
if (places.size() > 1) { if (places.size() > 1) {
std::vector<ncclComm_t> comms; std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
comms.resize(order_.size()); {
std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(order_.size()), &order_[0])); comms.get(), static_cast<int>(order_.size()), order_.data()));
}
int i = 0; int i = 0;
for (auto &dev_id : order_) { for (auto &dev_id : order_) {
contexts_.at(dev_id).comm_ = comms[i++]; contexts_.at(dev_id).comm_ = comms[i++];
...@@ -120,6 +99,9 @@ struct NCCLContextMap { ...@@ -120,6 +99,9 @@ struct NCCLContextMap {
} }
} }
NCCLContextMap(const NCCLContextMap &other) = delete;
NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
CUDADeviceContext *DevCtx(platform::Place p) const { CUDADeviceContext *DevCtx(platform::Place p) const {
......
...@@ -505,11 +505,19 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -505,11 +505,19 @@ All parameter, weight, gradient are variables in Paddle.
scope, local_scopes, allow_op_delay); scope, local_scopes, allow_op_delay);
}) })
.def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
// one by one and mark them as reference.
.def("local_scopes", .def("local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * { [](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes(); return &self.GetLocalScopes();
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("feed_tensors_into_local_scopes",
&ParallelExecutor::FeedTensorsIntoLocalScopes)
.def("feed_and_split_tensor_into_local_scopes",
&ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
.def("run", &ParallelExecutor::Run); .def("run", &ParallelExecutor::Run);
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
......
...@@ -190,6 +190,11 @@ void PyCUDATensorSetFromArray( ...@@ -190,6 +190,11 @@ void PyCUDATensorSetFromArray(
static_cast<const platform::CUDADeviceContext *>(pool.Get(place)); static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream()); cudaMemcpyHostToDevice, dev_ctx->stream());
// NOTE: For safety, wait here until the copy completes,
// because the host buffer array.data() could be destroyed after this method returns.
// If the copy were left asynchronous, data could be read from a host buffer
// that has already been freed.
dev_ctx->Wait();
} }
template <> template <>
...@@ -216,6 +221,11 @@ void PyCUDATensorSetFromArray( ...@@ -216,6 +221,11 @@ void PyCUDATensorSetFromArray(
paddle::platform::GpuMemcpyAsync(dst, array.data(), paddle::platform::GpuMemcpyAsync(dst, array.data(),
sizeof(uint16_t) * array.size(), sizeof(uint16_t) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream()); cudaMemcpyHostToDevice, dev_ctx->stream());
// NOTE: For safety, wait here until the copy completes,
// because the host buffer array.data() could be destroyed after this method returns.
// If the copy were left asynchronous, data could be read from a host buffer
// that has already been freed.
dev_ctx->Wait();
} }
template <typename T> template <typename T>
......
...@@ -198,7 +198,7 @@ EOF ...@@ -198,7 +198,7 @@ EOF
# run paddle version to install python packages first # run paddle version to install python packages first
RUN apt-get update &&\ RUN apt-get update &&\
${NCCL_DEPS}\ ${NCCL_DEPS}\
apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \ apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip==9.0.3 && \
pip install /*.whl; apt-get install -f -y && \ pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \ apt-get clean -y && \
rm -f /*.whl && \ rm -f /*.whl && \
......
...@@ -32,6 +32,8 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); ...@@ -32,6 +32,8 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
DEFINE_string(tensorrt_dir, "", "Specify path for loading libnvinfer.so.");
static inline std::string join(const std::string& part1, static inline std::string join(const std::string& part1,
const std::string& part2) { const std::string& part2) {
// directory separator // directory separator
...@@ -157,3 +159,12 @@ void GetLapackDsoHandle(void** dso_handle) { ...@@ -157,3 +159,12 @@ void GetLapackDsoHandle(void** dso_handle) {
GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
#endif #endif
} }
void GetTensorRtDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(
FLAGS_tensorrt_dir, "libnvinfer.dylib", dso_handle);
#else
GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so", dso_handle);
#endif
}
...@@ -58,3 +58,11 @@ void GetWarpCTCDsoHandle(void** dso_handle); ...@@ -58,3 +58,11 @@ void GetWarpCTCDsoHandle(void** dso_handle);
* *
*/ */
void GetLapackDsoHandle(void** dso_handle); void GetLapackDsoHandle(void** dso_handle);
/**
* @brief load the DSO of tensorrt
*
* @param **dso_handle dso handler
*
*/
void GetTensorRtDsoHandle(void** dso_handle);
...@@ -37,6 +37,7 @@ from distribute_transpiler import DistributeTranspiler ...@@ -37,6 +37,7 @@ from distribute_transpiler import DistributeTranspiler
from distribute_transpiler_simple import SimpleDistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler
from concurrency import (Go, make_channel, channel_send, channel_recv, from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select) channel_close, Select)
from inference_transpiler import InferenceTranspiler
import clip import clip
from memory_optimization_transpiler import memory_optimize, release_memory from memory_optimization_transpiler import memory_optimize, release_memory
import profiler import profiler
...@@ -66,6 +67,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ ...@@ -66,6 +67,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
'clip', 'clip',
'SimpleDistributeTranspiler', 'SimpleDistributeTranspiler',
'DistributeTranspiler', 'DistributeTranspiler',
'InferenceTranspiler',
'memory_optimize', 'memory_optimize',
'release_memory', 'release_memory',
'profiler', 'profiler',
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from framework import Program
from executor import global_scope
from . import core
class InferenceTranspiler:
def transpile(self, program, place, scope=None):
'''
Transpile the program. Currently only batch normalization fusion is supported.
:param program: program to transpile
:type program: Program
:param place: inference place
:type place: Place
:param scope: inference scope
:type scope: Scope or None
'''
if not isinstance(program, Program):
raise TypeError("program should be a Program")
if not isinstance(place, core.CPUPlace) and not isinstance(
place, core.CUDAPlace):
raise TypeError("place should be a CPUPlace or CUDAPlace")
if scope is None:
scope = global_scope()
if not isinstance(scope, core.Scope):
raise TypeError("scope should be a Scope or None")
self.fuse_batch_norm(program, place, scope)
def fuse_batch_norm(self, program, place, scope):
'''
Transpile the program by fusing batch normalization.
A batch normalization layer that follows a convolution or fully connected
layer can be folded into that layer. Doing so accelerates the forward pass,
especially in environments such as mobile or embedded devices.
For input X:
- Conv process: X = input * W + bias
- Batch norm process: X' = (X - mean) / std
- Scale Process: Y = a * X' + b
After fusing into one operation:
Y = (input * W + bias - mean) / std * a + b
= input * a * W / std + ((bias - mean) / std * a + b)
The operator transformation is:
- before:
- conv->batch_norm->any_other_op (bias == 0)
- conv->elementwise_add->batch_norm->any_other_op (bias != 0)
- after:
- conv->elementwise_add->any_other_op
The transpile stages are:
1. insert elementwise_add op when bias == 0.
2. fuse the batch_norm parameters into the conv and elementwise_add operators.
3. remove the batch_norm ops that are no longer used by any other op.
4. adjust the input of any_other_op to be the output of the elementwise_add operator.
5. remove unused variables.
:param program: program to transpile
:type program: Program
:param place: inference place
:type place: Place
:param scope: inference scope
:type scope: Scope
'''
self.scope = scope
self.place = place
self.block = program.block(0)
self.input_map = {} # store the input names should be adjusted
i = 0
while i < len(self.block.ops):
current_op = self.block.ops[i]
# TODO(luotao1): only conv2d is considered now. fc will be dealt with later.
if current_op.type in ['conv2d']:
# TODO(luotao1): only a single-chain network is considered now.
# For a branch network, we couldn't use block.ops[i + 1] as
# the judgment condition.
next_op = self.block.ops[i + 1]
# conv2d without bias
if (next_op.type == 'batch_norm'):
# insert bias op
bias_op = self._insert_bias_op(i + 1, current_op, next_op)
# fuse batch_norm
self._fuse_param(current_op, next_op, bias_op, 0)
# remove batch_norm_op
self.block.remove_op(i + 2)
i = i + 1
# conv2d with bias, the next_op.type is elementwise_add
elif (next_op.type == 'elementwise_add'):
next_next_op = self.block.ops[i + 2]
if (next_next_op.type == 'batch_norm'):
# fuse batch_norm
self._fuse_param(current_op, next_next_op, next_op, 1)
# remove batch_norm_op
self.block.remove_op(i + 2)
i = i + 1
i = i + 1
self._adjust_input()
self._remove_unused_var()
# TODO(luotao): use the clone() method to force a flush of program.desc,
# since a large program.desc may not be flushed immediately.
# A better solution will be considered later.
program = program.clone()
# ====================== private transpiler functions =====================
def _insert_bias_op(self, index, current_op, bn_op):
'''
Construct an elementwise_add operator for adding the bias
and insert it into the program.
:param index: insert location of bias_op
:type index: Int
:param current_op: current operator (conv or fc)
:type current_op: Operator
:param bn_op: batch norm operator
:type bn_op: Operator
:return: bias_op
:rtype: Operator
'''
# The input of bias_op is current_op's output and Bias of bn_op
# The output of bias_op is bn_op's output
x_var = self.block.var(current_op.output("Output")[0])
y_var = self.block.var(bn_op.input("Bias")[0])
out_var = self.block.var(bn_op.output("Y")[0])
bias_op = self.block.insert_op(
index,
type="elementwise_add",
inputs={"X": x_var,
"Y": y_var},
outputs={"Out": out_var},
attrs={"axis": 1}) # dim_start=1
return bias_op
def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
'''
fuse the batch_norm_op's parameters into current_op (conv or fc)
:param current_op: current operator (conv or fc)
:type current_op: Operator
:param bn_op: batch norm operator
:type bn_op: Operator
:param bias_op: elementwise_add operator for adding bias
:type bias_op: Operator
:param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
:type with_bias: Int
'''
def _update_param(op, old_param_name, new_param):
# To keep the original variables unchanged, create new variables
# in the scope to store the new parameters.
old_param_name = old_param_name[0]
old_var = self.block.vars[old_param_name]
new_param_name = old_param_name + '_fuse_bn'
new_var = self.block.create_parameter(
name=new_param_name.encode('ascii'),
type=old_var.type,
dtype=old_var.dtype,
shape=old_var.shape)
op.rename_input(old_param_name, new_param_name)
self.scope.var(new_param_name)
tensor = self.scope.find_var(new_param_name).get_tensor()
tensor.set(np.array(new_param), self.place)
def _load_param(param_name):
return np.array(self.scope.find_var(param_name[0]).get_tensor())
bias_bn = _load_param(bn_op.input("Bias")) #Bias
scale_bn = _load_param(bn_op.input("Scale")) #Scale
mean_bn = _load_param(bn_op.input("Mean")) #Mean
var_bn = _load_param(bn_op.input("Variance")) #Variance
# TODO(luotao1): only conv2d is considered now. fc will be dealt with later.
current_param = _load_param(current_op.input("Filter"))
std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
tmp = np.float32(np.divide(scale_bn, std_bn))
# add bias of batch_norm_op to conv2d
if with_bias:
bias = _load_param(bias_op.input("Y"))
else:
bias = np.zeros(bias_bn.shape)
bias = np.float32(
np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
# re-compute weight of conv2d
tmp = tmp.reshape(tmp.shape[0], -1)
dst_param = current_param.reshape((tmp.shape[0], -1))
dst_param = np.float32(np.multiply(dst_param, tmp))
dst_param = dst_param.reshape(current_param.shape)
# update parameters
_update_param(current_op, current_op.input("Filter"), dst_param)
_update_param(bias_op, bias_op.input("Y"), bias)
# collect the renamed input
self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
def _adjust_input(self):
for i in range(len(self.block.ops)):
current_op = self.block.ops[i]
for input_arg in current_op.input_arg_names:
if input_arg in self.input_map:
current_op.rename_input(input_arg,
self.input_map[input_arg])
def _remove_unused_var(self):
'''
remove unused variables in the program
'''
args = []
for i in range(len(self.block.ops)):
current_op = self.block.ops[i]
args += current_op.input_arg_names
args += current_op.output_arg_names
args = list(set(args)) # unique the input and output arguments
for var in self.block.vars.keys():
if var not in args:
self.block.remove_var(var)
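The identity the transpiler relies on, folding (X * W + bias - mean) / std * a + b into a single layer with rescaled weights and a folded bias, can be checked numerically. A small numpy sketch with a plain matrix multiply standing in for the convolution (all shapes and values are arbitrary):

```python
import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(4, 3).astype(np.float32)      # input
w = rng.randn(3, 2).astype(np.float32)      # conv/fc weight, used as a matmul
bias = rng.randn(2).astype(np.float32)      # conv/fc bias
scale = rng.randn(2).astype(np.float32)     # batch_norm Scale (a)
b_bn = rng.randn(2).astype(np.float32)      # batch_norm Bias  (b)
mean = rng.randn(2).astype(np.float32)      # batch_norm Mean
var = rng.rand(2).astype(np.float32)        # batch_norm Variance
std = np.sqrt(var + 1e-5)

# Unfused: layer output, then normalize, then scale and shift.
y_ref = (x.dot(w) + bias - mean) / std * scale + b_bn

# Fused: one layer with rescaled weights and a folded bias, as _fuse_param computes.
w_fused = w * (scale / std)                 # rescale each output channel
bias_fused = (bias - mean) / std * scale + b_bn
y_fused = x.dot(w_fused) + bias_fused

print(np.allclose(y_ref, y_fused, atol=1e-5))  # True
```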
...@@ -32,7 +32,6 @@ __all__ = [ ...@@ -32,7 +32,6 @@ __all__ = [
'Switch', 'Switch',
'lod_rank_table', 'lod_rank_table',
'max_sequence_len', 'max_sequence_len',
'topk',
'lod_tensor_to_array', 'lod_tensor_to_array',
'array_to_lod_tensor', 'array_to_lod_tensor',
'increment', 'increment',
...@@ -751,43 +750,6 @@ def max_sequence_len(rank_table): ...@@ -751,43 +750,6 @@ def max_sequence_len(rank_table):
return res return res
def topk(input, k):
"""
**topk**
This function performs the operation that selects the k entries in the input
vector and outputs their values and indices as vectors. Thus topk_out[j] is
the j-th largest entry in input, and its index is topk_indices[j]
Args:
input (Variable|list): The input tensor that has all the data.
k (int): The number of top elements that the function will pick.
Returns:
Variable: The variable of type array that contains the k largest entries
from input.
Variable: The variable of type array that contains the indices of k
largest entries from input.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10])
k = 5
array = fluid.layers.topk(x, k)
"""
helper = LayerHelper('topk', **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype='int64')
helper.append_op(
type='top_k',
inputs={'X': [input]},
outputs={'Out': [topk_out],
'Indices': [topk_indices]},
attrs={'k': k})
return topk_out, topk_indices
def lod_tensor_to_array(x, table): def lod_tensor_to_array(x, table):
""" Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY. """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from .. import core from .. import core
from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
from ..unique_name import generate as unique_name from ..unique_name import generate as unique_name
from control_flow import BlockGuard from control_flow import BlockGuard
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
...@@ -158,6 +158,7 @@ class ListenAndServ(object): ...@@ -158,6 +158,7 @@ class ListenAndServ(object):
main_program = self.helper.main_program main_program = self.helper.main_program
current_block = main_program.current_block() current_block = main_program.current_block()
parent_block = self.parent_block() parent_block = self.parent_block()
empty_block = Program().global_block()
parent_block.append_op( parent_block.append_op(
type='listen_and_serv', type='listen_and_serv',
...@@ -166,11 +167,12 @@ class ListenAndServ(object): ...@@ -166,11 +167,12 @@ class ListenAndServ(object):
attrs={ attrs={
'endpoint': self.endpoint, 'endpoint': self.endpoint,
'Fanin': self.fan_in, 'Fanin': self.fan_in,
'OptimizeBlock': current_block 'OptimizeBlock': current_block,
'PrefetchBlock': empty_block
}) })
def Send(endpoints, send_vars, get_vars): def Send(endpoints, send_vars, get_vars=None):
""" """
Send layer Send layer
...@@ -184,7 +186,6 @@ def Send(endpoints, send_vars, get_vars): ...@@ -184,7 +186,6 @@ def Send(endpoints, send_vars, get_vars):
side when server have finished running server side program. side when server have finished running server side program.
""" """
assert (type(send_vars) == list) assert (type(send_vars) == list)
assert (type(get_vars) == list)
epmap = endpoints.split(",") epmap = endpoints.split(",")
endpoints = list(set(epmap)) endpoints = list(set(epmap))
...@@ -192,6 +193,11 @@ def Send(endpoints, send_vars, get_vars): ...@@ -192,6 +193,11 @@ def Send(endpoints, send_vars, get_vars):
helper = LayerHelper("Send", **locals()) helper = LayerHelper("Send", **locals())
rpc_client_var = default_main_program().global_block().create_var( rpc_client_var = default_main_program().global_block().create_var(
name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW) name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW)
if not get_vars:
get_vars = []
for s in send_vars:
v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True)
get_vars.append(v)
helper.append_op( helper.append_op(
type="send", type="send",
...@@ -200,6 +206,7 @@ def Send(endpoints, send_vars, get_vars): ...@@ -200,6 +206,7 @@ def Send(endpoints, send_vars, get_vars):
"RPCClient": rpc_client_var}, "RPCClient": rpc_client_var},
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
return get_vars
def Recv(endpoints, get_vars): def Recv(endpoints, get_vars):
......
...@@ -20,7 +20,7 @@ from ..initializer import init_on_cpu ...@@ -20,7 +20,7 @@ from ..initializer import init_on_cpu
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
'polynomial_decay', 'piecewise_decay' 'polynomial_decay', 'piecewise_decay', 'noam_decay'
] ]
""" """
When training a model, it's often useful to decay the When training a model, it's often useful to decay the
...@@ -32,14 +32,41 @@ strategy according to this module. ...@@ -32,14 +32,41 @@ strategy according to this module.
""" """
def _decay_step_counter(): def _decay_step_counter(begin=0):
# the first global step is zero in learning rate decay # the first global step is zero in learning rate decay
global_step = nn.autoincreased_step_counter( global_step = nn.autoincreased_step_counter(
counter_name='@LR_DECAY_COUNTER@', begin=0, step=1) counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
global_step = tensor.cast(global_step, 'float32') global_step = tensor.cast(global_step, 'float32')
return global_step return global_step
def noam_decay(d_model, warmup_steps):
"""Apply decay to learning rate.
```python
lr_value = np.power(d_model, -0.5) * np.min([
np.power(current_steps, -0.5),
np.power(warmup_steps, -1.5) * current_steps
])
```
Args:
d_model(Variable): The dimensionality of input and output of model.
Reference: attention is all you need
https://arxiv.org/pdf/1706.03762.pdf
warmup_steps(Variable): A hyperparameter controlling the warmup phase.
Returns:
The decayed learning rate.
"""
global_step = _decay_step_counter(1)
with init_on_cpu():
a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
return lr_value
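For intuition, here is a plain NumPy sketch of the same schedule; the concrete values `d_model=512` and `warmup_steps=4000` are only illustrative (taken from the Transformer paper, not from this patch):

```python
import numpy as np

def noam_lr(step, d_model, warmup_steps):
    # lr = d_model^-0.5 * min(step^-0.5, warmup_steps^-1.5 * step)
    return (d_model ** -0.5) * np.minimum(step ** -0.5,
                                          (warmup_steps ** -1.5) * step)

# The rate grows linearly during warmup, peaks at step == warmup_steps,
# then decays proportionally to step^-0.5.
steps = np.array([1, 4000, 100000], dtype='float64')
print(noam_lr(steps, d_model=512, warmup_steps=4000))
```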
def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
"""Applies exponential decay to the learning rate. """Applies exponential decay to the learning rate.
......
...@@ -20,6 +20,7 @@ from ..layer_helper import LayerHelper ...@@ -20,6 +20,7 @@ from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
from ..framework import Variable from ..framework import Variable
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
import nn
__all__ = ['accuracy', 'auc'] __all__ = ['accuracy', 'auc']
...@@ -27,17 +28,10 @@ __all__ = ['accuracy', 'auc'] ...@@ -27,17 +28,10 @@ __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None): def accuracy(input, label, k=1, correct=None, total=None):
""" """
This function computes the accuracy using the input and label. This function computes the accuracy using the input and label.
The output is the top_k inputs and their indices. The output is the top k inputs and their indices.
""" """
helper = LayerHelper("accuracy", **locals()) helper = LayerHelper("accuracy", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype) topk_out, topk_indices = nn.topk(input, k=k)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
acc_out = helper.create_tmp_variable(dtype="float32") acc_out = helper.create_tmp_variable(dtype="float32")
if correct is None: if correct is None:
correct = helper.create_tmp_variable(dtype="int64") correct = helper.create_tmp_variable(dtype="int64")
...@@ -68,12 +62,7 @@ def auc(input, label, curve='ROC', num_thresholds=200): ...@@ -68,12 +62,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
helper = LayerHelper("auc", **locals()) helper = LayerHelper("auc", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype) topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64") topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op( topk_out, topk_indices = nn.topk(input, k=k)
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
auc_out = helper.create_tmp_variable(dtype="float32") auc_out = helper.create_tmp_variable(dtype="float32")
if correct is None: if correct is None:
correct = helper.create_tmp_variable(dtype="int64") correct = helper.create_tmp_variable(dtype="int64")
......
...@@ -60,6 +60,7 @@ __all__ = [ ...@@ -60,6 +60,7 @@ __all__ = [
'edit_distance', 'edit_distance',
'l2_normalize', 'l2_normalize',
'matmul', 'matmul',
'topk',
'warpctc', 'warpctc',
'sequence_reshape', 'sequence_reshape',
'transpose', 'transpose',
...@@ -88,6 +89,7 @@ def fc(input, ...@@ -88,6 +89,7 @@ def fc(input,
bias_attr=None, bias_attr=None,
use_mkldnn=False, use_mkldnn=False,
act=None, act=None,
is_test=False,
name=None): name=None):
""" """
**Fully Connected Layer** **Fully Connected Layer**
...@@ -134,6 +136,7 @@ def fc(input, ...@@ -134,6 +136,7 @@ def fc(input,
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to None, no bias will be added to the output units. of this layer. If it is set to None, no bias will be added to the output units.
act (str, default None): Activation to be applied to the output of this layer. act (str, default None): Activation to be applied to the output of this layer.
is_test(bool): A flag indicating whether execution is in test phase.
use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
library is installed. Default: False library is installed. Default: False
name (str, default None): The name of this layer. name (str, default None): The name of this layer.
...@@ -177,8 +180,11 @@ def fc(input, ...@@ -177,8 +180,11 @@ def fc(input,
inputs={"Input": input, inputs={"Input": input,
"W": w}, "W": w},
outputs={"Out": tmp}, outputs={"Out": tmp},
attrs={"use_mkldnn": use_mkldnn, attrs={
"bias_attr": bias_attr}) "use_mkldnn": use_mkldnn,
"is_test": is_test,
"bias_attr": bias_attr
})
return helper.append_activation(tmp) return helper.append_activation(tmp)
else: else:
for input_var, param_attr in helper.iter_inputs_and_params(): for input_var, param_attr in helper.iter_inputs_and_params():
...@@ -2571,6 +2577,53 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): ...@@ -2571,6 +2577,53 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
return out return out
def topk(input, k):
"""
This operator is used to find values and indices of the k largest entries
for the last dimension.
If the input is a vector (rank=1), finds the k largest entries in the vector
and outputs their values and indices as vectors. Thus values[j] is the j-th
largest entry in input, and its index is indices[j].
If the input is a Tensor with higher rank, this operator computes the top k
entries along the last dimension.
Args:
input(Variable): The input variable which can be a vector or Tensor with
higher rank.
k(int): An integer value to specify the top k largest elements.
Returns:
values(Variable): The k largest elements along each last dimensional
slice.
indices(Variable): The indices of values within the last dimension of
input.
Examples:
.. code-block:: python
top5_values, top5_indices = layers.topk(input, k=5)
"""
shape = input.shape
if k < 1 or k >= shape[-1]:
raise ValueError("k must be greater than 0 and less than %d." %
(shape[-1]))
helper = LayerHelper("top_k", **locals())
values = helper.create_tmp_variable(dtype=input.dtype)
indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [values],
"Indices": [indices]},
attrs={"k": k})
values.stop_gradient = True
indices.stop_gradient = True
return values, indices
def edit_distance(input, label, normalized=True, ignored_tokens=None, def edit_distance(input, label, normalized=True, ignored_tokens=None,
name=None): name=None):
""" """
...@@ -2712,15 +2765,7 @@ def ctc_greedy_decoder(input, blank, name=None): ...@@ -2712,15 +2765,7 @@ def ctc_greedy_decoder(input, blank, name=None):
cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0) cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
""" """
helper = LayerHelper("ctc_greedy_decoder", **locals()) helper = LayerHelper("ctc_greedy_decoder", **locals())
# top 1 op _, topk_indices = topk(input, k=1)
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": 1})
# ctc align op # ctc align op
ctc_out = helper.create_tmp_variable(dtype="int64") ctc_out = helper.create_tmp_variable(dtype="int64")
......
...@@ -16,6 +16,8 @@ import core ...@@ -16,6 +16,8 @@ import core
import multiprocessing import multiprocessing
import framework import framework
import executor import executor
import warnings
import sys
__all__ = ['ParallelExecutor'] __all__ = ['ParallelExecutor']
...@@ -61,8 +63,8 @@ class ParallelExecutor(object): ...@@ -61,8 +63,8 @@ class ParallelExecutor(object):
main_program=test_program, main_program=test_program,
share_vars_from=train_exe) share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict) train_loss, = train_exe.run([loss.name], feed=feed_dict)
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict) test_loss, = test_exe.run([loss.name], feed=feed_dict)
""" """
self._places = [] self._places = []
...@@ -102,8 +104,8 @@ class ParallelExecutor(object): ...@@ -102,8 +104,8 @@ class ParallelExecutor(object):
self.persistable_vars = [ self.persistable_vars = [
v.name v.name
for v in filter(lambda var: \ for v in filter(
var.persistable and var.type != core.VarDesc.VarType.RAW, lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
main.list_vars()) main.list_vars())
] ]
...@@ -123,28 +125,93 @@ class ParallelExecutor(object): ...@@ -123,28 +125,93 @@ class ParallelExecutor(object):
allow_op_delay) allow_op_delay)
self.scope = scope self.scope = scope
def run(self, fetch_list, feed_dict={}): def run(self, fetch_list, feed=None, feed_dict=None):
""" """
:param fetch_list: A list of variable names that will be fetched. Run a parallel executor with fetch_list.
:param feed_dict: A dict mapping for feed variable name to LoDTensor
or numpy array. The feed parameter can be a dict or a list. If feed is a dict, the
:return: fetched value list. feed data will be split into multiple devices. If feed is a list, we
""" assume the data has been splitted into multiple devices, the each
if not isinstance(feed_dict, dict): element in the list will be copied to each device directly.
raise TypeError("feed_dict should be a dict")
For example, if the feed is a dict:
>>> exe = ParallelExecutor()
>>> # the image will be splitted into devices. If there is two devices
>>> # each device will process an image with shape (24, 1, 28, 28)
>>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
For example, if the feed is a list:
>>> exe = ParallelExecutor()
>>> # each device will process each element in the list.
>>> # the 1st device will process an image with shape (48, 1, 28, 28)
>>> # the 2nd device will process an image with shape (32, 1, 28, 28)
>>> #
>>> # you can use exe.device_count to get the device number.
>>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
>>> {"image": numpy.random.random(size=(32, 1, 28, 28))},
>>> ])
Args:
fetch_list(list): The fetched variable names
feed(list|dict|None): The feed variables. If the feed is a dict,
tensors in that dict will be splitted into each devices. If
the feed is a list, each element of the list will be copied
to each device.
feed_dict: Alias for feed parameter, for backward compatibility.
This parameter is deprecated.
feed_tensor_dict = {} Returns: fetched result list.
for i, feed_name in enumerate(feed_dict):
feed_tensor = feed_dict[feed_name] """
if not isinstance(feed_tensor, core.LoDTensor): if feed is None and feed_dict is not None:
feed_tensor = core.LoDTensor() feed = feed_dict
feed_tensor.set(feed_dict[feed_name], self._act_places[0]) print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
feed_tensor_dict[feed_name] = feed_tensor
if isinstance(feed, dict):
feed_tensor_dict = dict()
for feed_name in feed:
feed_tensor = feed[feed_name]
if not isinstance(feed_tensor, core.LoDTensor):
feed_tensor = core.LoDTensor()
# always set to CPU place, since the tensor need to be splitted
# it is fast in CPU
feed_tensor.set(feed[feed_name], core.CPUPlace())
feed_tensor_dict[feed_name] = feed_tensor
self.executor.feed_and_split_tensor_into_local_scopes(
feed_tensor_dict)
elif isinstance(feed, list) or isinstance(feed, tuple):
if len(feed) != len(self._act_places):
raise ValueError(
"Feed a list of tensor, the list should be the same size as places"
)
res = list()
for i, each in enumerate(feed):
if not isinstance(each, dict):
raise TypeError(
"Each element of feed list should be a dict")
res_dict = dict()
for feed_name in each:
tensor = each[feed_name]
if not isinstance(tensor, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(tensor, self._act_places[i])
tensor = tmp
res_dict[feed_name] = tensor
res.append(res_dict)
self.executor.feed_tensors_into_local_scopes(res)
fetch_var_name = '@FETCHED_VAR_NAME@' fetch_var_name = '@FETCHED_VAR_NAME@'
self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict) self.executor.run(fetch_list, fetch_var_name)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
def bcast_params(self): def bcast_params(self):
self.executor.bcast_params(set(self.persistable_vars)) self.executor.bcast_params(set(self.persistable_vars))
@property
def device_count(self):
return len(self._act_places)
...@@ -22,10 +22,17 @@ import sys ...@@ -22,10 +22,17 @@ import sys
import numpy import numpy
import unittest import unittest
import os import os
import numpy as np
def resnet_cifar10(input, depth=32): def resnet_cifar10(input, depth=32):
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d( tmp = fluid.layers.conv2d(
input=input, input=input,
filter_size=filter_size, filter_size=filter_size,
...@@ -33,7 +40,7 @@ def resnet_cifar10(input, depth=32): ...@@ -33,7 +40,7 @@ def resnet_cifar10(input, depth=32):
stride=stride, stride=stride,
padding=padding, padding=padding,
act=None, act=None,
bias_attr=False) bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act) return fluid.layers.batch_norm(input=tmp, act=act)
def shortcut(input, ch_in, ch_out, stride): def shortcut(input, ch_in, ch_out, stride):
...@@ -44,7 +51,7 @@ def resnet_cifar10(input, depth=32): ...@@ -44,7 +51,7 @@ def resnet_cifar10(input, depth=32):
def basicblock(input, ch_in, ch_out, stride): def basicblock(input, ch_in, ch_out, stride):
tmp = conv_bn_layer(input, ch_out, 3, stride, 1) tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None) tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
short = shortcut(input, ch_in, ch_out, stride) short = shortcut(input, ch_in, ch_out, stride)
return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
...@@ -219,11 +226,26 @@ def infer(use_cuda, save_dirname=None): ...@@ -219,11 +226,26 @@ def infer(use_cuda, save_dirname=None):
batch_size = 1 batch_size = 1
tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32") tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
# Use inference_transpiler to speed up
inference_transpiler_program = inference_program.clone()
t = fluid.InferenceTranspiler()
t.transpile(inference_transpiler_program, place)
# Construct feed as a dictionary of {feed_target_name: feed_target_data} # Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets. # and results will contain a list of data corresponding to fetch_targets.
results = exe.run(inference_program, results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img}, feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets) fetch_list=fetch_targets)
transpiler_results = exe.run(inference_transpiler_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
assert len(results[0]) == len(transpiler_results[0])
for i in range(len(results[0])):
np.testing.assert_almost_equal(
results[0][i], transpiler_results[0][i], decimal=6)
print("infer results: ", results[0]) print("infer results: ", results[0])
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# The fully connected test is removed when the WITH_MKLDNN flag is OFF # The MKLDNN tests are skipped when the MKLDNN flag is OFF
# Because the fully connected layer has only one kernel (MKLDNN)
if(NOT WITH_MKLDNN) if(NOT WITH_MKLDNN)
list(REMOVE_ITEM TEST_OPS test_fc_op) foreach(src ${TEST_OPS})
if(${src} MATCHES ".*_mkldnn_op$")
list(REMOVE_ITEM TEST_OPS ${src})
endif()
endforeach()
endif(NOT WITH_MKLDNN) endif(NOT WITH_MKLDNN)
if(NOT WITH_DISTRIBUTE) if(NOT WITH_DISTRIBUTE)
...@@ -62,6 +65,7 @@ list(REMOVE_ITEM TEST_OPS test_registry) ...@@ -62,6 +65,7 @@ list(REMOVE_ITEM TEST_OPS test_registry)
list(REMOVE_ITEM TEST_OPS test_fetch_var) list(REMOVE_ITEM TEST_OPS test_fetch_var)
list(REMOVE_ITEM TEST_OPS test_parallel_op) list(REMOVE_ITEM TEST_OPS test_parallel_op)
list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input) list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input)
list(REMOVE_ITEM TEST_OPS test_dist_train)
# tests that can be bundled together in one python process for speed. # tests that can be bundled together in one python process for speed.
if(WITH_FAST_BUNDLE_TEST) if(WITH_FAST_BUNDLE_TEST)
...@@ -100,3 +104,4 @@ py_test_modules(test_registry MODULES test_registry) ...@@ -100,3 +104,4 @@ py_test_modules(test_registry MODULES test_registry)
py_test_modules(test_fetch_var MODULES test_fetch_var) py_test_modules(test_fetch_var MODULES test_fetch_var)
py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input) py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input)
py_test_modules(test_parallel_op MODULES test_parallel_op) py_test_modules(test_parallel_op MODULES test_parallel_op)
py_test_modules(test_dist_train MODULES test_dist_train)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from scipy.special import expit
from test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
class TestMKLDNNReluDim2(TestRelu):
def setUp(self):
super(TestMKLDNNReluDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanhDim2(TestTanh):
def setUp(self):
super(TestMKLDNNTanhDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrtDim2(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrtDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbsDim2(TestAbs):
def setUp(self):
super(TestMKLDNNAbsDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNReluDim4(TestRelu):
def setUp(self):
super(TestMKLDNNReluDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanhDim4(TestTanh):
def setUp(self):
super(TestMKLDNNTanhDim4, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrtDim4(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrtDim4, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbsDim4(TestAbs):
def setUp(self):
super(TestMKLDNNAbsDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x}
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
if __name__ == '__main__':
unittest.main()
...@@ -1098,82 +1098,5 @@ class TestFP16Swish(TestSwish): ...@@ -1098,82 +1098,5 @@ class TestFP16Swish(TestSwish):
self.check_output_with_place(place, atol=1e-3) self.check_output_with_place(place, atol=1e-3)
#--------------------test MKLDNN--------------------
class TestMKLDNNReluDim2(TestRelu):
def setUp(self):
super(TestMKLDNNReluDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanhDim2(TestTanh):
def setUp(self):
super(TestMKLDNNTanhDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrtDim2(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrtDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbsDim2(TestAbs):
def setUp(self):
super(TestMKLDNNAbsDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNReluDim4(TestRelu):
def setUp(self):
super(TestMKLDNNReluDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanhDim4(TestTanh):
def setUp(self):
super(TestMKLDNNTanhDim4, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrtDim4(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrtDim4, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbsDim4(TestAbs):
def setUp(self):
super(TestMKLDNNAbsDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x}
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride
class TestMKLDNN(TestConv2dOp):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNWithPad(TestWithPad):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNWithStride(TestWithStride):
def init_kernel_type(self):
self.use_mkldnn = True
if __name__ == '__main__':
unittest.main()
...@@ -373,22 +373,5 @@ class TestDepthwiseConv2(TestConv2dOp): ...@@ -373,22 +373,5 @@ class TestDepthwiseConv2(TestConv2dOp):
# def init_op_type(self): # def init_op_type(self):
# self.op_type = "conv_cudnn" # self.op_type = "conv_cudnn"
#----------------Conv2dMKLDNN----------------
class TestMKLDNN(TestConv2dOp):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNWithPad(TestWithPad):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNWithStride(TestWithStride):
def init_kernel_type(self):
self.use_mkldnn = True
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -15,31 +15,42 @@ ...@@ -15,31 +15,42 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import numpy import numpy
from multiprocessing import Process from multiprocessing import Process
from threading import Thread
import os, sys import os, sys
import time import time
class TestRecvOp(unittest.TestCase): class TestSendOp(unittest.TestCase):
def no_test_send(self): def test_send(self):
# Run init_serv in a thread # Run init_serv in a thread
place = fluid.CPUPlace() place = fluid.CPUPlace()
# NOTE: python thread will not work here due to GIL.
p = Process(target=self.init_serv, args=(place, )) p = Process(target=self.init_serv, args=(place, ))
p.daemon = True p.daemon = True
p.start() p.start()
time.sleep(1)
self.init_client(place) time.sleep(10)
with open("/tmp/paddle.selected_port", "r") as fn:
selected_port = int(fn.readlines()[0])
self.init_client(place, selected_port)
self.run_local(place)
self.assertTrue(numpy.allclose(self.local_out, self.dist_out))
# FIXME(typhoonzero): find a way to gracefully shutdown the server. # FIXME(typhoonzero): find a way to gracefully shutdown the server.
os.system("kill -9 %d" % p.pid) os.system("kill -9 %d" % p.pid)
p.join() p.join()
def init_serv(self, place): def init_serv(self, place):
main = fluid.Program() main = fluid.Program()
with fluid.program_guard(main): with fluid.program_guard(main):
serv = layers.ListenAndServ( serv = layers.ListenAndServ(
"127.0.0.1:6174", ["X"], optimizer_mode=False) "127.0.0.1:0", ["X"], optimizer_mode=False)
with serv.do(): with serv.do():
x = layers.data( x = layers.data(
shape=[32, 32], shape=[32, 32],
...@@ -50,10 +61,29 @@ class TestRecvOp(unittest.TestCase): ...@@ -50,10 +61,29 @@ class TestRecvOp(unittest.TestCase):
o = layers.scale(x=x, scale=10.0) o = layers.scale(x=x, scale=10.0)
main.global_block().create_var( main.global_block().create_var(
name=o.name, persistable=False, dtype=o.dtype, shape=o.shape) name=o.name, persistable=False, dtype=o.dtype, shape=o.shape)
self.server_exe = fluid.Executor(place)
self.server_exe.run(main)
def init_client(self, place, port):
main = fluid.Program()
with fluid.program_guard(main):
x = layers.data(
shape=[32, 32],
dtype='float32',
name='X',
append_batch_size=False)
fluid.initializer.Constant(value=2.3)(x, main.global_block())
get_var = main.global_block().create_var(
name="scale_0.tmp_0", # server side var
dtype="float32",
persistable=False,
shape=[32, 32])
o = layers.Send("127.0.0.1:%d" % port, [x], [get_var])
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(main) self.dist_out = exe.run(main, fetch_list=o) # o is a list
def init_client(self, place): def run_local(self, place):
main = fluid.Program() main = fluid.Program()
with fluid.program_guard(main): with fluid.program_guard(main):
x = layers.data( x = layers.data(
...@@ -61,10 +91,10 @@ class TestRecvOp(unittest.TestCase): ...@@ -61,10 +91,10 @@ class TestRecvOp(unittest.TestCase):
dtype='float32', dtype='float32',
name='X', name='X',
append_batch_size=False) append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block()) fluid.initializer.Constant(value=2.3)(x, main.global_block())
layers.Send("127.0.0.1:6174", [x], [x]) o = layers.scale(x=x, scale=10.0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(main) self.local_out = exe.run(main, fetch_list=[o])
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -350,6 +350,15 @@ class TestBook(unittest.TestCase): ...@@ -350,6 +350,15 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(smooth_label) self.assertIsNotNone(smooth_label)
print(str(program)) print(str(program))
def test_topk(self):
program = Program()
with program_guard(program):
data = layers.data(name="label", shape=[200], dtype="float32")
values, indices = layers.topk(data, k=5)
self.assertIsNotNone(values)
self.assertIsNotNone(indices)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test_lrn_op import TestLRNOp
class TestLRNMKLDNNOp(TestLRNOp):
def get_attrs(self):
attrs = TestLRNOp.get_attrs(self)
attrs['use_mkldnn'] = True
return attrs
def test_check_output(self):
self.check_output(atol=0.002)
class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
def get_attrs(self):
attrs = TestLRNMKLDNNOp.get_attrs(self)
attrs['is_test'] = True
return attrs
def test_check_grad_normal(self):
def check_raise_is_test():
try:
self.check_grad(['X'], 'Out', max_relative_error=0.01)
except Exception as e:
t = \
"is_test attribute should be set to False in training phase."
if t in str(e):
raise AttributeError
self.assertRaises(AttributeError, check_raise_is_test)
if __name__ == "__main__":
unittest.main()
...@@ -87,34 +87,5 @@ class TestLRNOp(OpTest): ...@@ -87,34 +87,5 @@ class TestLRNOp(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.01) self.check_grad(['X'], 'Out', max_relative_error=0.01)
class TestLRNMKLDNNOp(TestLRNOp):
def get_attrs(self):
attrs = TestLRNOp.get_attrs(self)
attrs['use_mkldnn'] = True
return attrs
def test_check_output(self):
self.check_output(atol=0.002)
class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
def get_attrs(self):
attrs = TestLRNMKLDNNOp.get_attrs(self)
attrs['is_test'] = True
return attrs
def test_check_grad_normal(self):
def check_raise_is_test():
try:
self.check_grad(['X'], 'Out', max_relative_error=0.01)
except Exception as e:
t = \
"is_test attribute should be set to False in training phase."
if t in str(e):
raise AttributeError
self.assertRaises(AttributeError, check_raise_is_test)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -200,34 +200,56 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -200,34 +200,56 @@ class TestParallelExecutorBase(unittest.TestCase):
def check_network_convergence(self, def check_network_convergence(self,
method, method,
memory_opt=True, memory_opt=True,
iter=10, iter=50,
batch_size=None, batch_size=None,
allow_op_delay=False, allow_op_delay=False,
feed_dict={}): feed_dict=None,
seed=None,
use_parallel_executor=True):
def run_executor(exe, feed, fetch_list, program=None):
if isinstance(exe, fluid.ParallelExecutor):
res = exe.run(fetch_list=fetch_list, feed=feed)
elif isinstance(exe, fluid.Executor):
if program is None:
program = fluid.default_main_program()
res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
else:
raise ValueError('Unknown type exe')
return res
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
startup.random_seed = 1 # Fix random seed
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
loss = method(use_feed=len(feed_dict) > 0) if seed is not None:
startup.random_seed = seed
loss = method(use_feed=feed_dict is not None)
adam = fluid.optimizer.Adam() adam = fluid.optimizer.Adam()
adam.minimize(loss) adam.minimize(loss)
if memory_opt: if memory_opt:
fluid.memory_optimize(main) fluid.memory_optimize(main)
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
startup_exe = fluid.Executor(place) startup_exe = fluid.Executor(place)
startup_exe.run(startup) startup_exe.run(startup)
exe = fluid.ParallelExecutor(True, loss_name=loss.name) if use_parallel_executor:
exe = fluid.ParallelExecutor(
True, loss_name=loss.name, allow_op_delay=allow_op_delay)
else:
exe = fluid.Executor(place=place)
if batch_size is not None: if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count() batch_size *= fluid.core.get_cuda_device_count()
begin = time.time() begin = time.time()
first_loss, = exe.run([loss.name], feed_dict=feed_dict) first_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name])
first_loss = numpy.array(first_loss) first_loss = numpy.array(first_loss)
for i in xrange(iter): for i in xrange(iter):
exe.run([], feed_dict=feed_dict) run_executor(exe=exe, feed=feed_dict, fetch_list=[])
last_loss, = exe.run([loss.name], feed_dict=feed_dict) last_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name])
end = time.time() end = time.time()
if batch_size is not None: if batch_size is not None:
...@@ -238,6 +260,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -238,6 +260,7 @@ class TestParallelExecutorBase(unittest.TestCase):
print first_loss, last_loss print first_loss, last_loss
# self.assertGreater(first_loss[0], last_loss[0]) # self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
class TestMNIST(TestParallelExecutorBase): class TestMNIST(TestParallelExecutorBase):
...@@ -267,6 +290,27 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -267,6 +290,27 @@ class TestMNIST(TestParallelExecutorBase):
simple_fc_net, feed_dict={"image": img, simple_fc_net, feed_dict={"image": img,
"label": label}) "label": label})
def test_simple_fc_parallel_accuracy(self):
img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64')
single_first_loss, single_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
feed_dict={"image": img,
"label": label},
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
feed_dict={"image": img,
"label": label},
use_parallel_executor=True)
for p_f in parallel_first_loss:
self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
for p_l in parallel_last_loss:
self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
def test_batchnorm_fc(self): def test_batchnorm_fc(self):
self.check_network_convergence(fc_with_batchnorm) self.check_network_convergence(fc_with_batchnorm)
img = numpy.zeros(shape=[32, 784], dtype='float32') img = numpy.zeros(shape=[32, 784], dtype='float32')
...@@ -495,10 +539,10 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): ...@@ -495,10 +539,10 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
share_vars_from=train_exe) share_vars_from=train_exe)
for i in xrange(5): for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict) test_loss, = test_exe.run([loss.name], feed=feed_dict)
test_loss = numpy.array(test_loss) test_loss = numpy.array(test_loss)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict) train_loss, = train_exe.run([loss.name], feed=feed_dict)
train_loss = numpy.array(train_loss) train_loss = numpy.array(train_loss)
self.assertTrue( self.assertTrue(
numpy.allclose( numpy.allclose(
...@@ -648,5 +692,5 @@ class TestCRFModel(unittest.TestCase): ...@@ -648,5 +692,5 @@ class TestCRFModel(unittest.TestCase):
for i in xrange(10): for i in xrange(10):
cur_batch = next(data) cur_batch = next(data)
print map(numpy.array, print map(numpy.array,
pe.run(feed_dict=feeder.feed(cur_batch), pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))[0] fetch_list=[avg_cost.name]))[0]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test_pool2d_op import TestPool2d_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
class TestMKLDNNCase1(TestPool2d_Op):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase2(TestCase1):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase3(TestCase2):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase4(TestCase3):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase5(TestCase4):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase6(TestCase5):
def init_kernel_type(self):
self.use_mkldnn = True
if __name__ == '__main__':
unittest.main()
...@@ -317,36 +317,5 @@ class TestCeilModeCase4(TestCase2): ...@@ -317,36 +317,5 @@ class TestCeilModeCase4(TestCase2):
self.ceil_mode = True self.ceil_mode = True
#--------------------test pool2d MKLDNN--------------------
class TestMKLDNNCase1(TestPool2d_Op):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase2(TestCase1):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase3(TestCase2):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase4(TestCase3):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase5(TestCase4):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNCase6(TestCase5):
def init_kernel_type(self):
self.use_mkldnn = True
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -50,7 +50,7 @@ An example implementation for single item data reader creator: ...@@ -50,7 +50,7 @@ An example implementation for single item data reader creator:
def reader(): def reader():
while True: while True:
yield numpy.random.uniform(-1, 1, size=width*height) yield numpy.random.uniform(-1, 1, size=width*height)
return reader return reader
An example implementation for multiple item data reader creator: An example implementation for multiple item data reader creator:
...@@ -60,7 +60,7 @@ An example implementation for multiple item data reader creator: ...@@ -60,7 +60,7 @@ An example implementation for multiple item data reader creator:
def reader(): def reader():
while True: while True:
yield numpy.random.uniform(-1, 1, size=width*height), label yield numpy.random.uniform(-1, 1, size=width*height), label
return reader return reader
TODO(yuyang18): Should we add whole design doc here? TODO(yuyang18): Should we add whole design doc here?
......
# AWS benchmark testing tool
This is an automation tool for deploying PaddlePaddle benchmark testing to AWS.
## Features
- subnet creation to fit just the number of EC2 instances required.
- pserver and trainer EC2 instance allocation, and instance state verification
- nvidia-docker ready for GPU training
- Instance and network element garbage collection when a task is accomplished or an error occurs
- Test log is collected in real time
- Web service for checking logs or tearing down the testing setup
- No testing code change needed
- Lots of optional configuration options
## Usages
### Prerequisites
- You have a working AWS account
- You have [AWS Command Line Interface](https://aws.amazon.com/cli/) installed
- Your AWS CLI is bound to an account which has the `AmazonEC2FullAccess` permission, and it is set as the default credential.
- You have a key pair created and the pem file downloaded.
- You have a default VPC in the region you want to run the test.
- You have a Security Group created for the VPC mentioned above, which allows port 22 and the port on which you want to expose your control web service (5436 by default)
- If your test is supposed to run on a GPU machine, especially a multi-card GPU machine (p2, p3 series), you might need to contact Amazon to raise the limit, which by default allows no more than 1 GPU instance at a time.
### Start a benchmark test
#### Create training image
*What to expect in this step:*
*You will have your training logic packed together with the PaddlePaddle runtime into a docker image that AWS instances can pick up for training.*
The training Python script and the PaddlePaddle runtime are supposed to be packed into one docker image. Use a PaddlePaddle production image as the base image and create the training image with a Dockerfile like the following:
```Dockerfile
FROM paddlepaddle/paddle:latest-gpu
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "my_training.py"]
```
***Please Note***
Training nodes will run your `ENTRYPOINT` script with the following environment variables (a sketch of reading them in Python follows the list):
- `TASK_NAME`: unique name to identify this training process.
- `TRAINING_ROLE`: current node's role in this training process, either "PSERVER" or "TRAINER"
- `PSERVER_HOSTS`: comma-separated list of pserver endpoints, e.g. "192.168.1.2:5436,192.168.1.3:5436"
- `PSERVERS`: same as above
- `TRAINERS`: trainer count
- `SERVER_ENDPOINT`: current server endpoint if the node role is a pserver
- `TRAINER_INDEX`: an integer identifying the index of the current trainer if the node role is a trainer.
- `PADDLE_INIT_TRAINER_ID`: same as above
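For illustration, a minimal sketch of how an `ENTRYPOINT` script might read these variables; the variable names follow the list above, while the branching logic itself is only an assumption about a typical training script:

```python
import os

# Hypothetical entrypoint sketch: read the environment variables listed above.
task_name = os.getenv("TASK_NAME")
training_role = os.getenv("TRAINING_ROLE")            # "PSERVER" or "TRAINER"
pserver_hosts = os.getenv("PSERVER_HOSTS", "").split(",")
trainer_count = int(os.getenv("TRAINERS", "1"))
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", "0"))

if training_role == "PSERVER":
    # A pserver only needs to know its own endpoint.
    endpoint = os.getenv("SERVER_ENDPOINT")
    print("task %s: starting pserver on %s" % (task_name, endpoint))
else:
    # A trainer needs the full pserver list and its own index.
    print("task %s: starting trainer %d of %d against %s" %
          (task_name, trainer_id, trainer_count, ",".join(pserver_hosts)))
```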
Now that we have a working distributed training script which takes advantage of the node environment variables, and a Dockerfile, build and tag the training image by running the following command:
```bash
docker build -t myreponame/paddle_benchmark .
```
Now that you have the image built and tagged with `myreponame/paddle_benchmark`, push it to Docker Hub so that it can be picked up by our AWS instances.
```bash
docker push myreponame/paddle_benchmark
```
#### Create instances and start training
*What to expect in this step*
*You will be asked to provide some basic settings to configure your training, and this tool will start and monitor the training for you.*
Now let's start the training process:
```bash
docker run -i -v $HOME/.aws:/root/.aws -v <full path to your pem file>:/root/<key pair name>.pem \
putcn/paddle_aws_client \
--action create \
--key_name <your key pair name> \
--security_group_id <your security group id> \
--docker_image myreponame/paddle_benchmark \
--pserver_count 2 \
--trainer_count 2
```
Now just wait until you see this:
```
master server finished init process, visit http://XXX:XXX/status to check master log
```
That means you can turn off your laptop: the cluster is creating instances, starting the training process, collecting logs, and it will eventually shut all pservers and trainers down when training is finished.
#### Post creation operations
To access the master log:
```bash
docker run -i -v $HOME/.aws:/root/.aws \
putcn/paddle_aws_client \
--action status \
--master_server_public_ip <master ip> \
--master_server_port <master port>
```
To tear down the training setup:
```bash
docker run -i -v $HOME/.aws:/root/.aws \
putcn/paddle_aws_client \
--action cleanup \
--master_server_public_ip <master ip> \
--master_server_port <master port>
```
To retrieve training logs
TBD
### Tech details
*What to expect in this step*
*You will understand what is happening behind the scenes: how to check the training log, how to tear down the training on the fly, etc.*
Let's understand what is happening under the hood when you run the above command on your laptop
![alt](diagram.png)
There are 4 roles in the figure above:
- client: your laptop
- master: talks to the AWS API server to create/tear down instances, and monitors the training process
- AWS API server: the one that actually creates and manages instances
- pservers and trainers: training instances
When you run the `docker run` command above, what it actually does is ask the AWS API service to create a subnet (step 1) and a master instance (step 2), and pass along all the parameters the client collected or generated (step 3). The master is kept to a minimal hardware config to keep the running cost low.
Then, when the master is up and running, it will ask the AWS API server to create the heavy-lifting training instances, which are expensive to run (step 4). The master will start the training process as soon as they are done initializing (step 5).
Meanwhile, the master will expose a web service for the client to check the training log or even tear the training setup down with a web service call.
If you are creating the training with the client docker container and also monitoring your AWS dashboard, you will initially see an instance tagged with `ROLE=MASTER` and `TASK_NAME=<your task name>_master` start, and then several instances tagged with `ROLE=PSERVER` and `ROLE=TRAINER` start.
When the training is finished, pservers and trainers will be terminated. All their logs are kept in the master node's docker environment.
Master exposes 4 major services (a small `requests` sketch for calling them follows the list):
- GET `/status`: return master log
- GET `/logs`: return list of log file names
- GET `/log/<logfile name>`: return a particular log by log file name
- POST `/cleanup`: tear down the whole setup
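As a rough sketch, the same services can be called from Python with `requests`; the host, port, and log file name below are placeholders, to be replaced with your master's public IP, configured port, and an actual name returned by `/logs`:

```python
import requests

# Placeholder base URL; substitute your master's public IP and port.
master = "http://<master ip>:5436"

print(requests.get(master + "/status").text)    # master log
print(requests.get(master + "/logs").text)      # list of log file names
# Fetch one particular log by its file name (placeholder name here).
print(requests.get(master + "/log/master.log").text)
# Tear the whole setup down.
requests.post(master + "/cleanup")
```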
### Parameters
TBD, please refer to client/cluster_launcher.py for now
### Troubleshooting
TBD
FROM python:2.7.14-stretch
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "cluster_launcher.py"]
\ No newline at end of file
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
import math
import logging
import copy
import netaddr
import boto3
import namesgenerator
import paramiko
from scp import SCPClient
import requests
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--key_name', type=str, default="", help="required, key pair name")
parser.add_argument(
'--security_group_id',
type=str,
default="",
help="required, the security group id associated with your VPC")
parser.add_argument(
'--vpc_id',
type=str,
default="",
help="The VPC in which you wish to run test")
parser.add_argument(
'--subnet_id',
type=str,
default="",
help="The Subnet_id in which you wish to run test")
parser.add_argument(
'--pserver_instance_type',
type=str,
default="c5.2xlarge",
help="your pserver instance type, c5.2xlarge by default")
parser.add_argument(
'--trainer_instance_type',
type=str,
default="p2.8xlarge",
help="your trainer instance type, p2.8xlarge by default")
parser.add_argument(
'--task_name',
type=str,
default="",
help="the name you want to identify your job")
parser.add_argument(
'--pserver_image_id',
type=str,
default="ami-da2c1cbf",
help="ami id for system image, default one has nvidia-docker ready, \
use ami-1ae93962 for us-east-2")
parser.add_argument(
'--pserver_command', type=str, default="", help="pserver start command")
parser.add_argument(
'--trainer_image_id',
type=str,
default="ami-da2c1cbf",
help="ami id for system image, default one has nvidia-docker ready, \
use ami-1ae93962 for us-west-2")
parser.add_argument(
'--trainer_command', type=str, default="", help="trainer start command")
parser.add_argument(
'--availability_zone',
type=str,
default="us-east-2a",
help="aws zone id to place ec2 instances")
parser.add_argument(
'--trainer_count', type=int, default=1, help="Trainer count")
parser.add_argument(
'--pserver_count', type=int, default=1, help="Pserver count")
parser.add_argument(
'--action', type=str, default="create", help="create|cleanup|status")
parser.add_argument('--pem_path', type=str, help="private key file")
parser.add_argument(
'--pserver_port', type=str, default="5436", help="pserver port")
parser.add_argument(
'--docker_image', type=str, default="busybox", help="training docker image")
parser.add_argument(
'--master_server_port', type=int, default=5436, help="master server port")
parser.add_argument(
'--master_server_public_ip', type=str, help="master server public ip")
parser.add_argument(
'--master_docker_image',
type=str,
default="putcn/paddle_aws_master:latest",
help="master docker image id")
parser.add_argument(
'--no_clean_up',
type=str2bool,
default=False,
help="whether to clean up after training")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
ec2client = boto3.client('ec2')
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def create_subnet():
# if no vpc id provided, list vpcs
logging.info("start creating subnet")
if not args.vpc_id:
logging.info("no vpc provided, trying to find the default one")
vpcs_desc = ec2client.describe_vpcs(
Filters=[{
"Name": "isDefault",
"Values": ["true", ]
}], )
if len(vpcs_desc["Vpcs"]) == 0:
raise ValueError('No default VPC')
args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
logging.info("default vpc fount with id %s and CidrBlock %s" %
(args.vpc_id, vpc_cidrBlock))
if not vpc_cidrBlock:
logging.info("trying to find cidrblock for vpc")
vpcs_desc = ec2client.describe_vpcs(
Filters=[{
"Name": "vpc-id",
"Values": [args.vpc_id, ],
}], )
if len(vpcs_desc["Vpcs"]) == 0:
raise ValueError('No VPC found')
vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
# list subnets in vpc in order to create a new one
logging.info("trying to find ip blocks for new subnet")
subnets_desc = ec2client.describe_subnets(
Filters=[{
"Name": "vpc-id",
"Values": [args.vpc_id, ],
}], )
ips_taken = []
for subnet_dec in subnets_desc["Subnets"]:
ips_taken.append(subnet_dec["CidrBlock"])
ip_blocks_avaliable = netaddr.IPSet(
[vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
# adding 10 addresses as buffer
cidr_prefix = 32 - math.ceil(
math.log(args.pserver_count + args.trainer_count + 10, 2))
if cidr_prefix <= 16:
raise ValueError('Too many nodes to fit in current VPC')
for ipnetwork in ip_blocks_avaliable.iter_cidrs():
try:
subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
logging.info("subnet ip block found %s" % (subnet_cidr))
break
except Exception:
pass
if not subnet_cidr:
raise ValueError(
'No available subnet to fit required nodes in current VPC')
logging.info("trying to create subnet")
subnet_desc = ec2client.create_subnet(
CidrBlock=str(subnet_cidr),
VpcId=args.vpc_id,
AvailabilityZone=args.availability_zone)
subnet_id = subnet_desc["Subnet"]["SubnetId"]
subnet_waiter = ec2client.get_waiter('subnet_available')
# sleep for 1s before checking its state
time.sleep(1)
subnet_waiter.wait(SubnetIds=[subnet_id, ])
logging.info("subnet created")
logging.info("adding tags to newly created subnet")
ec2client.create_tags(
Resources=[subnet_id, ],
Tags=[{
"Key": "Task_name",
'Value': args.task_name
}])
return subnet_id
def run_instances(image_id, instance_type, count=1, role="MASTER", cmd=""):
response = ec2client.run_instances(
ImageId=image_id,
InstanceType=instance_type,
MaxCount=count,
MinCount=count,
UserData=cmd,
DryRun=False,
InstanceInitiatedShutdownBehavior="stop",
KeyName=args.key_name,
Placement={'AvailabilityZone': args.availability_zone},
NetworkInterfaces=[{
'DeviceIndex': 0,
'SubnetId': args.subnet_id,
"AssociatePublicIpAddress": True,
'Groups': args.security_group_ids
}],
TagSpecifications=[{
'ResourceType': "instance",
'Tags': [{
"Key": 'Task_name',
"Value": args.task_name + "_master"
}, {
"Key": 'Role',
"Value": role
}]
}])
instance_ids = []
for instance in response["Instances"]:
instance_ids.append(instance["InstanceId"])
if len(instance_ids) > 0:
logging.info(str(len(instance_ids)) + " instance(s) created")
else:
logging.info("no instance created")
#create waiter to make sure it's running
logging.info("waiting for instance to become accessible")
waiter = ec2client.get_waiter('instance_status_ok')
waiter.wait(
Filters=[{
"Name": "instance-status.status",
"Values": ["ok"]
}, {
"Name": "instance-status.reachability",
"Values": ["passed"]
}, {
"Name": "instance-state-name",
"Values": ["running"]
}],
InstanceIds=instance_ids)
instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
return instances_response["Reservations"][0]["Instances"]
def generate_task_name():
return namesgenerator.get_random_name()
def init_args():
if not args.task_name:
args.task_name = generate_task_name()
logging.info("task name generated %s" % (args.task_name))
if not args.pem_path:
args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
if args.security_group_id:
args.security_group_ids = (args.security_group_id, )
def create():
init_args()
# create subnet
if not args.subnet_id:
args.subnet_id = create_subnet()
# create master node
master_instance_response = run_instances(
image_id="ami-7a05351f", instance_type="t2.nano")
logging.info("master server started")
args.master_server_public_ip = master_instance_response[0][
"PublicIpAddress"]
args.master_server_ip = master_instance_response[0]["PrivateIpAddress"]
logging.info("master server started, master_ip=%s, task_name=%s" %
(args.master_server_public_ip, args.task_name))
# cp config file and pems to master node
ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
ssh_client = paramiko.SSHClient()
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh_client.connect(
hostname=args.master_server_public_ip, username="ubuntu", pkey=ssh_key)
with SCPClient(ssh_client.get_transport()) as scp:
scp.put(os.path.expanduser("~") + "/" + ".aws",
recursive=True,
remote_path='/home/ubuntu/')
scp.put(args.pem_path,
remote_path='/home/ubuntu/' + args.key_name + ".pem")
logging.info("credentials and pem copied to master")
# set arguments and start docker
kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/"
kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem"
kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/"
kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str(
args.master_server_port)
kick_off_cmd += " " + args.master_docker_image
args_to_pass = copy.copy(args)
args_to_pass.action = "serve"
del args_to_pass.pem_path
del args_to_pass.security_group_ids
del args_to_pass.master_docker_image
del args_to_pass.master_server_public_ip
for arg, value in sorted(vars(args_to_pass).iteritems()):
if value:
kick_off_cmd += ' --%s %s' % (arg, value)
logging.info(kick_off_cmd)
stdin, stdout, stderr = ssh_client.exec_command(command=kick_off_cmd)
return_code = stdout.channel.recv_exit_status()
logging.info(return_code)
if return_code != 0:
raise Exception("Error while kicking off master")
logging.info(
"master server finished init process, visit %s to check master log" %
(get_master_web_url("/status")))
def cleanup():
print requests.post(get_master_web_url("/cleanup")).text
def status():
print requests.post(get_master_web_url("/status")).text
def get_master_web_url(path):
return "http://" + args.master_server_public_ip + ":" + str(
args.master_server_port) + path
if __name__ == "__main__":
print_arguments()
if args.action == "create":
if not args.key_name or not args.security_group_id:
raise ValueError("key_name and security_group_id are required")
create()
elif args.action == "cleanup":
if not args.master_server_public_ip:
raise ValueError("master_server_public_ip is required")
cleanup()
elif args.action == "status":
if not args.master_server_public_ip:
raise ValueError("master_server_public_ip is required")
status()
netaddr==0.7.19
boto3==1.6.21
namesgenerator==0.3
paramiko==2.4.1
scp
requests
FROM python:2.7.14-stretch
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "cluster_master.py"]
\ No newline at end of file