diff --git a/CMakeLists.txt b/CMakeLists.txt
index c649aafeddaf9f28c213d086236c3779d3137d92..23bbe829ac16180088bfa37df66e23f19b021ea3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,6 +179,7 @@ set(EXTERNAL_LIBS
if(WITH_GPU)
include(cuda)
+ include(tensorrt)
endif(WITH_GPU)
if(WITH_AMD_GPU)
diff --git a/Dockerfile b/Dockerfile
index 7856d3bbc492af4cad2d6b9f49001c90eadbea43..870304a6acc99e715dffbfabd8058be000b6872c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# install glide
RUN curl -s -q https://glide.sh/get | sh
+# Install TensorRT
+# The unnecessary files have been removed to keep the library small; it now contains only the include and lib directories.
+RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+ tar -xz -C /usr/local && \
+ cp -rf /usr/local/TensorRT/include /usr && \
+ cp -rf /usr/local/TensorRT/lib /usr
+
# git credential to skip password typing
RUN git config --global credential.helper store
@@ -57,7 +64,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed.
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
pip install -U wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark
diff --git a/Dockerfile.android b/Dockerfile.android
index cc022d596b4b74dd1e4f4d0901dd81c91a7decd1..848a7eba6f1421432addae8acff407b611adb4ae 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -27,7 +27,7 @@ RUN git config --global credential.helper store
# Fix locales to en_US.UTF-8
RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
pip install -U 'protobuf==3.1.0' && \
pip install -U wheel sphinx && \
pip install pre-commit
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 52a22c1fbf4779fa3c0ca687cab664bd3ca0410a..e3b9d94215a858c5c9a34e1b7e97540f1876801d 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -78,7 +78,7 @@ if(NOT CMAKE_CROSSCOMPILING)
/usr/lib/reference/
)
else()
- # Diable the finding of reference cblas under host's system path
+ # Disable the finding of reference cblas under host's system path
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index f726405c4773994f6ca6509e5218750805b03995..e490397cc0624c310949a4b571bd00cac6e8953b 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -80,6 +80,16 @@ if(WITH_GPU)
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
+
+ if(TENSORRT_FOUND)
+ if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+ message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+ endif()
+ if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+ message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+ endif()
+ include_directories(${TENSORRT_INCLUDE_DIR})
+ endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index aa249159470773241e0f6da2e8e086264634dd4a..e90948782bb5e333bbdb47ef9d61c1e37e3cf9e4 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
- GIT_TAG "v1.11.x"
+ GIT_TAG "v1.10.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0c07d36bed65400164853b99f18ec0335341cd94
--- /dev/null
+++ b/cmake/tensorrt.cmake
@@ -0,0 +1,33 @@
+if(NOT WITH_GPU)
+ return()
+endif()
+
+set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+ PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
+ $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
+ NO_DEFAULT_PATH
+)
+
+find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+ PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
+ $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
+ NO_DEFAULT_PATH
+ DOC "Path to TensorRT library.")
+
+if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+ set(TENSORRT_FOUND ON)
+else()
+ set(TENSORRT_FOUND OFF)
+endif()
+
+if(TENSORRT_FOUND)
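+  # NvInfer.h defines the version as, e.g., "#define NV_TENSORRT_MAJOR 4";
+  # read the header and extract the major version number from that macro.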
+ file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
+ string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
+ "${TENSORRT_VERSION_FILE_CONTENTS}")
+ string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
+ TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+
+ message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+ "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+endif()
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 7066637a7cb27b83724cb4030c29a1019981f52b..0f9521616952a2857222feab8c38fb480761ee2d 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -3,7 +3,9 @@ add_custom_target(paddle_apis ALL
add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn
- paddle_fluid_docs paddle_fluid_docs_cn)
+ paddle_fluid_docs paddle_fluid_docs_cn
+ paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2)
add_subdirectory(fluid)
+add_subdirectory(mobile)
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 22e6fb13d7320986a60bc1ef5530187e0970c767..5c02886efd7d11e9520910526fb90ec01e123bae 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -473,6 +473,12 @@ multiplex
.. autofunction:: paddle.fluid.layers.multiplex
:noindex:
+label_smooth
+------------
+
+.. autofunction:: paddle.fluid.layers.label_smooth
+ :noindex:
+
ops
===
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
index 9aed3b059a1595ba3971d7d5acfc0d16a731584b..4f88e27bed722e9f2f535e368926fe49b4e72e56 100644
--- a/doc/fluid/design/concepts/parallel_executor.md
+++ b/doc/fluid/design/concepts/parallel_executor.md
@@ -84,7 +84,7 @@ Running an operator can be asynchronized. There is a thread pool to execute an `
## Synchronize GPU Kernels
-The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm:
+The GPU is a non-blocking device. Different streams need to be synchronized when switching streams. In the current implementation, the synchronization is based on the following algorithm:
1. `OpHandle` will record `DeviceContext` that it is used.
2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.
diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2dd652d8bdcb8f3b6e759347bd55b217be909386
--- /dev/null
+++ b/doc/fluid/design/dist_train/README.md
@@ -0,0 +1,57 @@
+## Distributed training overview doc
+
+Currently, Paddle Fluid uses the parameter server architecture to support distributed training.
+
+For synchronous and asynchronous training, the differences are mostly in the logic of the parameter server. Synchronous training is already supported.
+
+### Synchronous training
+
+The synchronous training process is:
+
+<img src="src/sync_distributed_training.png"/>
+
+1. Pserver
+   1. The pserver sets `barrier_condition_` to 0 and waits for trainers to send gradients.
+1. Trainer
+   1. The trainer reads a minibatch of data, runs forward-backward with its local parameter copy, and gets the gradients for the parameters.
+   1. The trainer uses the split op to split all the gradients into blocks. The split method is determined at compile time.
+   1. The trainer uses send_op to send all the split gradients to the corresponding parameter servers.
+   1. After the trainer has sent all the gradients, it sends a `BATCH_BARRIER_MESSAGE` to all pservers.
+   1. The trainer calls GetVariable on the pservers and waits for `barrier_condition_` on each pserver to become 1.
+1. Pserver
+   1. The pserver counts the incoming `BATCH_BARRIER_MESSAGE`s.
+   1. When the count equals the number of trainers, the pserver knows it has received all gradients from all trainers.
+   1. The pserver runs the optimization block to optimize the parameters.
+   1. After optimization, the pserver sets `barrier_condition_` to 1.
+   1. The pserver waits for `FETCH_BARRIER_MESSAGE`s.
+1. Trainer
+   1. The trainer uses GetVariable to get all the parameters from the pservers.
+   1. The trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
+1. Pserver
+   1. When the number of `FETCH_BARRIER_MESSAGE`s reaches the number of trainers, the pserver knows all parameters have been fetched, and it goes back to step 1 to set `barrier_condition_` to 0 (see the sketch below).
+
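+The pserver-side barrier logic above can be sketched with standard C++ threading
+primitives. This is a minimal illustration only, not the actual RPC-based
+implementation; `BarrierState`, `trainer_count_`, and the handler names are assumed:
+
+```c++
+#include <condition_variable>
+#include <mutex>
+
+// Minimal sketch of the barrier protocol above; assumed names throughout.
+class BarrierState {
+ public:
+  explicit BarrierState(int trainer_count) : trainer_count_(trainer_count) {}
+
+  // Called when a BATCH_BARRIER_MESSAGE arrives from a trainer.
+  void OnBatchBarrier() {
+    std::unique_lock<std::mutex> lock(mu_);
+    if (++batch_count_ == trainer_count_) {
+      batch_count_ = 0;
+      // All gradients have arrived: run the optimization block here,
+      // then open the barrier so GetVariable calls can proceed.
+      barrier_condition_ = 1;
+      cv_.notify_all();
+    }
+  }
+
+  // Called by GetVariable: block until optimization has finished.
+  void WaitBarrier() {
+    std::unique_lock<std::mutex> lock(mu_);
+    cv_.wait(lock, [this] { return barrier_condition_ == 1; });
+  }
+
+  // Called when a FETCH_BARRIER_MESSAGE arrives from a trainer.
+  void OnFetchBarrier() {
+    std::unique_lock<std::mutex> lock(mu_);
+    if (++fetch_count_ == trainer_count_) {
+      fetch_count_ = 0;
+      barrier_condition_ = 0;  // back to step 1: close the barrier
+    }
+  }
+
+ private:
+  const int trainer_count_;
+  int batch_count_ = 0;
+  int fetch_count_ = 0;
+  int barrier_condition_ = 0;
+  std::mutex mu_;
+  std::condition_variable cv_;
+};
+```
+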
+### Asynchronous training
+In the above process, there are two barriers that synchronize all trainers with each other. In asynchronous training, these two barriers are not needed; the trainer can just send gradients to the pservers and then get the parameters back.
+
+The asynchronous training process is:
+
+<img src="src/async_distributed_training.png"/>
+
+1. Pserver:
+   1. Each parameter has a queue to receive its gradients from trainers.
+   1. Each parameter has a thread that reads gradients from the queue and runs the optimize block, using the gradient to optimize the parameter (see the sketch after this list).
+   1. An independent thread handles the RPC call `GetVariable`, which trainers use to get parameters back. (Maybe we should use a thread pool here to speed up fetching the parameters.)
+
+1. Trainer:
+   1. The trainer reads a batch of data and runs forward-backward with its local parameter copy to get the gradients for the parameters.
+   1. The trainer splits all the gradients into blocks and then sends these gradient blocks to the pservers (each pserver puts them into its queues).
+   1. The trainer gets all the parameters back from the pservers.
+
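+A minimal sketch of the per-parameter queue and optimizer thread described above
+(illustrative only; `BlockingQueue`, `Gradient`, and the plain SGD update are
+stand-ins for the real framework types and optimize block):
+
+```c++
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <vector>
+
+using Gradient = std::vector<float>;  // stand-in for a gradient tensor
+
+// Minimal blocking queue; one instance exists per parameter.
+class BlockingQueue {
+ public:
+  void Push(Gradient g) {
+    {
+      std::lock_guard<std::mutex> guard(mu_);
+      queue_.push_back(std::move(g));
+    }
+    cv_.notify_one();
+  }
+  Gradient Pop() {
+    std::unique_lock<std::mutex> lock(mu_);
+    cv_.wait(lock, [this] { return !queue_.empty(); });
+    Gradient g = std::move(queue_.front());
+    queue_.pop_front();
+    return g;
+  }
+
+ private:
+  std::deque<Gradient> queue_;
+  std::mutex mu_;
+  std::condition_variable cv_;
+};
+
+// One such loop runs in its own thread per parameter: gradients from
+// different trainers are applied one at a time, so updates never race.
+void OptimizeLoop(BlockingQueue* queue, std::vector<float>* param,
+                  float learning_rate) {
+  for (;;) {
+    Gradient g = queue->Pop();
+    for (size_t i = 0; i < param->size(); ++i) {
+      (*param)[i] -= learning_rate * g[i];  // the "optimize block" (plain SGD)
+    }
+  }
+}
+```
+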
+### Note:
+There are also some conditions that need to be considered. For example:
+
+1. Whether the trainer needs to wait for the pserver to apply its gradients before getting the parameters back.
+1. Whether we need a lock between the parameter update and the parameter fetch.
+1. Whether one parameter must live on a single server, or whether it can be split and sent to multiple parameter servers.
+
+The above asynchronous training architecture can support different modes; we can test these questions in detail in the future.
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a0835b761b69030ba30697e6e8863928efbf57f
--- /dev/null
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -0,0 +1,58 @@
+# Design Doc: Asynchronous Update With Distributed Training
+
+## Background
+
+For typical synchronous distributed training, the significant steps are as follows:
+
+1. A Trainer computes the gradients and SENDs them to the Parameter Server (PServer) nodes.
+1. After a PServer node has received the gradients from all the Trainers, it aggregates the
+gradient variables for the same parameter into one gradient variable and then applies the aggregated
+gradient to the respective parameter, finally using an optimization algorithm (SGD, Momentum...)
+to update the parameters.
+1. The Trainers wait for the PServers to finish the optimization stage and then GET the parameters from the PServers,
+so all the Trainers end up with the same parameters.
+
+In synchronous distributed training, there has to be a `Barrier` to synchronize the
+parameters after the optimization stage. The performance of a distributed training job
+depends on its slowest node. If there are hundreds or thousands of training nodes in a
+job, the performance of synchronous distributed training can be very poor because of
+the slow nodes. So this design doc introduces an approach to implement
+*asynchronous* distributed training in PaddlePaddle Fluid.
+
+## Design
+
+<img src="src/async_update.png"/>
+
+As shown in the figure above, we describe a global view of the asynchronous update process and use
+the parameter `w1` as an example to introduce the steps:
+1. Each gradient variable may be distributed across different GPU cards; aggregate
+them once they are all calculated.
+1. Split the gradient variable into multiple blocks according to the number of PServer
+instances and then send them.
+1. The PServer runs an `Optimize Block` with a specified optimization algorithm to update
+the specified parameter.
+1. The trainer fetches the latest parameter from the PServer before running the forward Op that depends
+on the specified parameter.
+1. Broadcast the received variable to multiple GPU cards and continue running the next
+mini-batch.
+
+### Trainer
+
+- For multi-device distributed training, we first need to aggregate the gradient
+variables placed on different devices and then schedule a `SendVars` operator to
+send the gradient variables to the multiple PServer instances.
+- Schedule a `FetchVars` operator to fetch the latest parameters from the PServer before running
+the forward ops.
+- There could be a large number of gradient variables to send, so we need to use another
+thread pool (IO threadpool), whose number of schedulable threads is larger than that of the
+computing thread pool, to avoid competing with computation for thread resources (see the sketch below).
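+
+A minimal sketch of one trainer step under these assumptions (all names are
+illustrative stand-ins, not the Fluid API; a bare `std::async` call stands in
+for the IO threadpool):
+
+```c++
+#include <functional>
+#include <future>
+#include <string>
+#include <vector>
+
+using Tensor = std::vector<float>;  // stand-in for a real tensor
+
+// Sum the per-device copies of one gradient variable (aggregation step).
+Tensor AggregateGradient(const std::vector<Tensor>& per_device) {
+  Tensor sum(per_device.front().size(), 0.f);
+  for (const auto& g : per_device) {
+    for (size_t i = 0; i < g.size(); ++i) sum[i] += g[i];
+  }
+  return sum;
+}
+
+void SendVars(const Tensor& grad) { /* split into blocks, send to PServers */ }
+Tensor FetchVars(const std::string& name) { return Tensor(); /* RPC GET */ }
+
+void TrainerStep(const std::vector<Tensor>& grads_w1) {
+  Tensor grad = AggregateGradient(grads_w1);
+  // Send on a separate thread (standing in for the IO threadpool),
+  // so computing threads are not blocked on communication.
+  auto sent = std::async(std::launch::async, SendVars, std::cref(grad));
+  sent.wait();
+  // Fetch the updated parameter before the forward op that needs it.
+  Tensor w1 = FetchVars("w1");
+}
+```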
+
+### Parameter Server
+
+<img src="src/async_pserver.png"/>
+
+- Multiple trainer instances may want to optimize the same parameter at
+the same time; to avoid races, we need one `BlockingQueue` for each gradient
+variable so the updates are processed one by one.
+- We need a `Map` structure that maps a gradient variable name to the `OptimizeBlock` which
+can optimize the respective parameter (see the sketch below).
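+
+A minimal sketch of the two structures described above (illustrative only;
+`GradMessage`, `OptimizeBlock`, and `BlockingQueue` are assumed names):
+
+```c++
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+struct GradMessage {
+  std::string grad_name;       // which gradient variable this block belongs to
+  std::vector<float> payload;  // stand-in for the gradient block itself
+};
+
+class OptimizeBlock {
+ public:
+  // Apply one gradient to the parameter this block owns.
+  void Run(const GradMessage& grad) { /* optimization algorithm here */ }
+};
+
+template <typename T>
+class BlockingQueue { /* one producer-consumer queue per gradient variable */ };
+
+struct PServerState {
+  // Map a gradient variable name to the block that optimizes its parameter.
+  std::unordered_map<std::string, OptimizeBlock> optimize_blocks;
+  // One queue per gradient variable, so updates are applied one by one.
+  std::unordered_map<std::string,
+                     std::unique_ptr<BlockingQueue<GradMessage>>> grad_queues;
+};
+```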
diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b53ab59c0cd7b44b2956f16f1adc47fe85909d3
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_distributed_training.png differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d2301611774fcb3866473e3e6470568d1e1312cf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d900b0c0eb291c67537b9cf93227c671bafdc73
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.png differ
diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..3a631888688a0d564a873fcb16d943958c91223e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..1168801bc1fadfce310a74cb3110695bd1629f6b
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_training.graffle differ
diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sync_distributed_training.png differ
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index b123b756e2251c38f319e1aefa2cb04fd7a36b03..ad798003f560e7fb0e6db6083fdd152fd3417584 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,6 +4,7 @@
.. toctree::
:maxdepth: 1
+ api_doc_std_cn.md
new_op_cn.md
new_op_kernel.md
use_eigen_cn.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index 98988fc22dcedecdbcd67fb3bf761377bf046337..80c899a82fa452c5cd8f38dad89c15d3041b09e3 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -4,6 +4,7 @@ Development
.. toctree::
:maxdepth: 1
+ api_doc_std_en.md
new_op_en.md
new_op_kernel.md
use_eigen_en.md
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b104a6318d474d6531670b8ac3569448774850c7
--- /dev/null
+++ b/doc/mobile/CMakeLists.txt
@@ -0,0 +1,53 @@
+if(NOT DEFINED SPHINX_THEME)
+ set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+ set(SPHINX_THEME_DIR)
+endif()
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+ "${BINARY_BUILD_DIR_EN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_mobile_docs
+ html
+ ${BINARY_BUILD_DIR_EN}
+ ${SPHINX_CACHE_DIR_EN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+ "${BINARY_BUILD_DIR_CN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_mobile_docs_cn
+ html
+ ${BINARY_BUILD_DIR_CN}
+ ${SPHINX_CACHE_DIR_CN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8297316e8fbb2b8f41954030293feadbcd81295e
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+移动端
+=====
+
+.. toctree::
+ :maxdepth: 1
+
+ cross_compiling_for_android_cn.md
+ cross_compiling_for_ios_cn.md
+ cross_compiling_for_raspberry_cn.md
\ No newline at end of file
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e0acdff0284e3bc84b2cc4a34a142ee01754f940
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,9 @@
+Mobile
+======
+
+.. toctree::
+ :maxdepth: 1
+
+ cross_compiling_for_android_en.md
+ cross_compiling_for_ios_en.md
+ cross_compiling_for_raspberry_en.md
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 85b649b2937f6a281b9ee1fe7bae8101169f6102..897e41f79f4e3bb9cecbe7b42fc6c4fd3401d839 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,5 +1,5 @@
cc_library(var_handle SRCS var_handle.cc DEPS place)
-cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
+cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
@@ -20,3 +20,11 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
+
+cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
+cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
+
+cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+ device_context broadcast_op_handle)
+cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+ device_context gather_op_handle)
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d29012380e1b1710704d71a28d21dcc3097eb51
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+Tensor *GetTensorFromVar(Variable *in_var) {
+  if (in_var->IsType<LoDTensor>()) {
+    return in_var->GetMutable<LoDTensor>();
+  } else if (in_var->IsType<SelectedRows>()) {
+    return in_var->GetMutable<SelectedRows>()->mutable_value();
+  } else {
+    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+  }
+  return nullptr;
+}
+
+BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+
+void BroadcastOpHandle::RunImpl() {
+  // The inputs may contain dummy variables.
+  std::vector<VarHandle *> in_var_handle;
+  for (auto *in : inputs_) {
+    auto *in_handle = dynamic_cast<VarHandle *>(in);
+    if (in_handle) {
+      in_var_handle.push_back(in_handle);
+    }
+  }
+  PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
+                    "The number of inputs should be one.");
+
+  // The outputs may contain dummy variables.
+  std::vector<VarHandle *> out_var_handles;
+  for (auto *out : outputs_) {
+    auto *out_handle = dynamic_cast<VarHandle *>(out);
+    if (out_handle) {
+      out_var_handles.push_back(out_handle);
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(
+      out_var_handles.size(), places_.size(),
+      "The number of outputs should equal the number of places.");
+
+  // Wait until the input variable is generated; this Wait is an asynchronous
+  // operation.
+  auto &in_place = in_var_handle[0]->place_;
+  if (in_var_handle[0]->generated_op_) {
+    for (auto *out : out_var_handles) {
+      auto &out_p = out->place_;
+      in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
+    }
+  }
+
+  // Find the input variable in its local scope and get its tensor.
+  auto in_scope_idx = in_var_handle[0]->scope_idx_;
+  auto in_var =
+      local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
+  Tensor *in_tensor = GetTensorFromVar(in_var);
+
+  for (auto *out : out_var_handles) {
+    auto &out_p = out->place_;
+    auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
+
+    PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
+                      "Places must be all on CPU or all on CUDA.");
+
+    if (in_var->IsType<SelectedRows>()) {
+      auto &in_sr = in_var->Get<SelectedRows>();
+      auto out_sr = out_var->GetMutable<SelectedRows>();
+      if (&in_sr == out_sr) continue;
+      out_sr->set_height(in_sr.height());
+      out_sr->set_rows(in_sr.rows());
+      out_sr->mutable_value()->Resize(in_sr.value().dims());
+      out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
+    } else if (in_var->IsType<LoDTensor>()) {
+      auto &in_lod = in_var->Get<LoDTensor>();
+      auto out_lod = out_var->GetMutable<LoDTensor>();
+      if (&in_lod == out_lod) continue;
+      out_lod->set_lod(in_lod.lod());
+      out_lod->Resize(in_lod.dims());
+      out_lod->mutable_data(out_p, in_lod.type());
+    } else {
+      PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
+    }
+
+    Tensor *out_tensor = GetTensorFromVar(out_var);
+    paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
+                                  out_tensor);
+  }
+}
+
+std::string BroadcastOpHandle::Name() const { return "broadcast"; }
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3292422522b64a38a50f39f04e6f0d2e15492dd
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>