提交 06f74ae6 编写于 作者: S Superjomn

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fea/docker_cudnn7

......@@ -139,9 +139,6 @@ def run_benchmark(model, args):
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
# Optimization
opt = fluid.optimizer.AdamOptimizer(
......@@ -161,7 +158,7 @@ def run_benchmark(model, args):
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
accuracy = fluid.average.WeightedAverage()
accuracy = fluid.metrics.Accuracy()
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
accuracy.reset()
......@@ -184,7 +181,7 @@ def run_benchmark(model, args):
"label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size_tensor]
) # The accuracy is the accumulation of batches, but not the current batch.
accuracy.add(value=outs[1], weight=outs[2])
accuracy.update(value=outs[1], weight=outs[2])
iters += 1
num_samples += len(y_data)
loss = np.array(outs[0])
......
......@@ -62,29 +62,33 @@ endif()
## Then find the reference-cblas. www.netlib.org/blas/
set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
"Folder contains reference-cblas")
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/include
/usr/include
/usr/include/cblas
)
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/lib
/usr/lib
/usr/lib/blas/reference/
/usr/lib/reference/
)
if(NOT CMAKE_CROSSCOMPILING)
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/include
/usr/include
/usr/include/cblas
)
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/lib
/usr/lib
/usr/lib/blas/reference/
/usr/lib/reference/
)
else()
# Diable the finding of reference cblas under host's system path
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
endif()
find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
set(CBLAS_FOUND ON)
set(CBLAS_PROVIDER REFERENCE)
set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
......
......@@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
IF(APPLE)
SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
ELSE()
SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
ENDIF()
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
GIT_TAG "v1.8.x"
GIT_TAG "v1.11.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_GPU)
return()
endif()
include(ExternalProject)
set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
if(WITH_DSO)
# If we use DSO, we do not build nccl, just download the dependencies
set(NCCL_BUILD_COMMAND "")
set(NCCL_INSTALL_COMMAND "")
set(NCCL_INSTALL_DIR "")
else()
# otherwise, we build nccl and link it.
set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
# Note: cuda 8.0 is needed to make nccl
# When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
set(NCCL_BUILD_COMMAND "make -j 8")
set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}")
endif()
ExternalProject_Add(
extern_nccl
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git"
GIT_TAG "v1.3.4-1"
PREFIX "${NCCL_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND "${NCCL_BUILD_COMMAND}"
INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}"
INSTALL_DIR "${NCCL_INSTALL_DIR}"
TEST_COMMAND ""
)
if(WITH_DSO)
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
add_library(nccl STATIC ${dummyfile})
else()
add_library(nccl INTERFACE)
endif()
else()
add_library(nccl STATIC IMPORTED GLOBAL)
set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
endif()
add_dependencies(nccl extern_nccl)
......@@ -11,19 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
if(MOBILE_INFERENCE OR RPI)
return()
ENDIF()
endif()
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
ExternalProject_Add(
extern_snappy
......@@ -51,8 +52,7 @@ ExternalProject_Add(
)
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
"${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
include_directories(${SNAPPY_INCLUDE_DIR})
add_dependencies(snappy extern_snappy)
......@@ -11,9 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
IF(MOBILE_INFERENCE OR RPI)
return()
ENDIF()
......@@ -21,9 +20,11 @@ include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
ExternalProject_Add(
extern_snappystream
......@@ -51,8 +52,7 @@ ExternalProject_Add(
)
add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
"${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
......
......@@ -195,14 +195,7 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
else()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
......@@ -243,11 +236,7 @@ function(cc_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
......@@ -77,6 +92,23 @@ elseif (WITH_MKLML)
)
endif()
if(NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
endif()
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
......
......@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad
From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
## Turing Completeness
......
......@@ -65,39 +65,55 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
不使用PaddlePaddle.org工具
--------------------------
使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即
使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。该方法与 `从源码编译PaddlePaddle <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_ 相似,通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行,在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档,具体步骤如下:
[TBD]
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
docker build -t paddle:dev .
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
# 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
bash -x /paddle/paddle/scripts/docker/build.sh
注:上述命令把当前目录(源码根目录)映射为 container 里的 :code:`/paddle` 目录。
编译完成后,会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录,分别进入这些目录下,执行以下命令:
.. code-block:: bash
python -m SimpleHTTPServer 8088
在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即
.. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# 如果只需要构建使用文档,则执行以下命令
make -j $processors gen_proto_py
make -j $processors paddle_docs paddle_docs_cn
make -j $processors paddle_docs
# 如果只需要构建API,则执行以下命令
make -j $processors gen_proto_py framework_py_proto
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
make -j $processors paddle_apis
其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。
编译完成后,进入 ``doc/v2`` 目录,如果选择构建文档则会在该目录下生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会生成 ``api/en/html`` 目录,分别进入这些目录下,执行以下命令:
编译完成后,同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录,分别进入这些子目录下,执行以下命令:
.. code-block:: bash
python -m SimpleHTTPServer 8088
在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。
在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。
.. image:: src/doc_en.png
:align: center
......
......@@ -68,39 +68,56 @@ Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develo
Manually Building the Documentation
-------------------------------------
Build PaddlePaddle's documentation with Docker,you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation.
Build PaddlePaddle's documentation with Docker,you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to ` Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_ , by constructing, from source code, a docker image that can be used to build PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
[TBD]
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# Construct a docker image from source code
docker build -t paddle:dev .
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
# Use build.sh to build PaddlePaddle documentation
bash -x /paddle/paddle/scripts/docker/build.sh
Note: The above commands maps the current directory (source root directory) to the :code:`/paddle` directory in the container.
After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8000, you could see the compiled ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page.
If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
.. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# If you only need to build documents, use the following commands
make -j $processors gen_proto_py
make -j $processors paddle_docs paddle_docs_cn
make -j $processors paddle_docs
# If you only need to build APIs, use the following commands
make -j $processors gen_proto_py framework_py_proto
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
make -j $processors paddle_apis
$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs,it will generate``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands:
After compiling, there also should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both two directories. If you chose to build APIs,a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
Use a web browser and navigate to http://localhost:8000, you could see the compiled ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page. The following figure is an example of the built ``v2`` 's English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
.. image:: src/doc_en.png
:align: center
......
......@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
endif()
add_subdirectory(testing)
if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
if(NOT MOBILE_INFERENCE AND NOT RPI)
add_subdirectory(fluid)
endif()
......@@ -3,6 +3,7 @@ add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
add_subdirectory(inference)
add_subdirectory(string)
add_subdirectory(recordio)
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)
......@@ -79,14 +79,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table feed_fetch_method)
framework_proto glog lod_rank_table feed_fetch_method)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/backward.h"
#include "paddle/fluid/operators/net_op.h"
#include <deque>
#include <list>
#include <memory>
#include <unordered_set>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/net_op.h"
namespace paddle {
namespace framework {
static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
// Control Flow operators's backward is significantly different from
// computational operators. Hack Code here.
// We should design a better way to backward CtrlFlowOps.
static std::unordered_set<std::string>& CtrlFlowOps() {
if (g_ctrl_flow_ops_ == nullptr) {
g_ctrl_flow_ops_ = new std::unordered_set<std::string>{
"increment", "lod_rank_table", "less_than"};
}
return *g_ctrl_flow_ops_;
}
static inline std::unique_ptr<OperatorBase> CreateGradOp(
const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var) {
OpDesc op_desc;
op_desc.SetInputMap(op.Inputs());
op_desc.SetOutputMap(op.Outputs());
op_desc.SetType(op.Type());
op_desc.SetAttrMap(op.Attrs());
auto& info = OpInfoMap::Instance().Get(op.Type());
auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {});
std::vector<std::unique_ptr<OperatorBase>> grad_ops;
grad_ops.reserve(grad_descs.size());
std::transform(grad_descs.begin(), grad_descs.end(),
std::back_inserter(grad_ops),
[](const std::unique_ptr<OpDesc>& grad_desc) {
return OpRegistry::CreateOp(*grad_desc);
});
PADDLE_ENFORCE(!grad_ops.empty());
if (grad_ops.size() == 1) {
return std::move(grad_ops[0]);
} else {
auto net_op = new operators::NetOp();
for (auto& grad_op : grad_ops) {
net_op->AppendOp(std::move(grad_op));
}
net_op->CompleteAddOp();
return std::unique_ptr<OperatorBase>(net_op);
}
}
template <typename Map, typename T>
static void ForEachVarName(const Map& names, T callback) {
for (auto& name : names) {
for (auto& n : name.second) {
if (callback(n)) return;
}
}
}
// return whether all the names + suffixes in the set
static bool AllInSet(
const std::map<std::string, std::vector<std::string>>& names,
const std::string& suffix, const std::unordered_set<std::string>& set) {
bool all_in_set = true;
ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) {
all_in_set = set.find(n + suffix) != set.end();
return !all_in_set;
});
return all_in_set;
}
static std::unique_ptr<OperatorBase> NOP() {
auto net_op = new operators::NetOp();
net_op->SetType("@NOP@");
net_op->CompleteAddOp();
return std::unique_ptr<OperatorBase>(net_op);
}
// Get backward operator from a forward operator, a recursive implementation.
//
// no_grad_names the gradient variable names without gradient calculating.
//
// uniq_id is a unique index used inside recursively calling
// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
// pass `uniq_id` through recursive calling.
//
// returns The backward operator. In a simple situation, it may be a simple
// operator, in a complex situation, it maybe a NetOp.
//
// See Backward.h for details
static std::unique_ptr<OperatorBase> BackwardRecursive(
const OperatorBase& forwardOp,
std::unordered_set<std::string>& no_grad_names,
std::unordered_map<std::string, std::string>* grad_to_var,
size_t& uniq_id) {
// If all input gradients of forwarding operator do not need to calculate,
// just return an NOP. Not return null ptr because NOP does not take
// too much time for calculation, but it is useful for simplifying logic.
if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/,
no_grad_names /*set*/)) {
return NOP();
}
// All output gradients of forwarding operator do not need to calculate.
// Then all input gradients cannot be computed at all, and we put them into
// `no_grad_names` set. Return an NOP.
if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/,
no_grad_names /*set*/)) {
ForEachVarName(forwardOp.Inputs(),
[&no_grad_names](const std::string& name) -> bool {
no_grad_names.insert(GradVarName(name));
return false;
});
return NOP();
}
// Returned gradient network
auto net = std::unique_ptr<operators::NetOp>(new operators::NetOp());
if (forwardOp.IsNetOp()) {
// Because forwardOp is a net op, it can static_cast.
auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
// Map from output gradient variable name to operator's indices in
// backward net's ops_. That operator generates that variable.
std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
size_t local_op_id = 0;
// reversely travel forwardNet and collect all duplicate outputs.
for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
++it, ++local_op_id) {
auto& fwd = *it;
auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id);
ForEachVarName(bwd->Outputs(),
[&dup_output_ops, local_op_id](const std::string& out) {
dup_output_ops[out].emplace_back(local_op_id);
return false;
});
net->AppendOp(std::move(bwd));
}
// Get unique ID for this method.
auto uid = uniq_id++;
// TODO(dzh): more comment
// multiple operators which have the same output (y for example) may
// overwrite the same y variable when backward, special operations are token
// to handle this case. For each duplicate output, rename it to an alias
// (original name with a offset), append an `add` op for its operator,
// and finally sum all the alias variable to the final output variable y.
using Pos = std::pair<size_t, std::unique_ptr<OperatorBase>>;
std::list<Pos> insert_position;
for (auto& dup_output_op : dup_output_ops) {
const std::string& name = dup_output_op.first;
// duplicate @Empty@ don't need to be added
if (name == kEmptyVarName) continue;
auto& dup_op = dup_output_op.second;
// no duplicate output
if (dup_op.size() == 1) continue;
// process the duplicate outputs
std::vector<std::string> dup_outputs;
for (size_t i = 0; i < dup_op.size(); ++i) {
// rename each duplicate output to an alias
auto op_offset = dup_op[i];
dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
std::to_string(i));
net->ops_[op_offset]->Rename(name, dup_outputs.back());
}
// collect all the offset for each alias,
// insert a sum operator to add all aliases to output
insert_position.push_back(
{dup_op.back(),
OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}},
AttributeMap{})});
}
// make sure the inserted `sum` ops follow the BFS order.
insert_position.sort(
[](const Pos& l, const Pos& r) { return l.first > r.first; });
for (auto& pos : insert_position) {
net->InsertOp(pos.first + 1, std::move(pos.second));
}
} else {
std::unique_ptr<OperatorBase> grad_op(
CreateGradOp(forwardOp, no_grad_names, grad_to_var));
ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
const std::string& grad_input) {
if (no_grad_names.count(grad_input)) {
// +1 for \0
std::string prefix = grad_input.substr(
0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
grad_op->Rename(grad_input, prefix + kZeroVarSuffix);
// If part of input gradient of that operator is not calculated, fill
// zero variables to that input gradient.
net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
{{"Out", {grad_input}}},
AttributeMap{}));
}
return false;
});
ForEachVarName(grad_op->Outputs(),
[&no_grad_names, &grad_op](const std::string& grad_output) {
if (no_grad_names.count(grad_output)) {
grad_op->Rename(grad_output, kEmptyVarName);
}
return false;
});
if (net->ops_.empty()) { // Current no aux op is added to network
return grad_op;
}
net->AppendOp(std::move(grad_op));
}
net->SetType("@GENERATED_BACKWARD@");
net->CompleteAddOp();
return std::unique_ptr<OperatorBase>(
static_cast<OperatorBase*>(net.release()));
}
// See header for comments
std::unique_ptr<OperatorBase> Backward(
const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string> no_grad_names;
no_grad_names.reserve(no_grad_vars.size() + 1);
no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
for (auto& name : no_grad_vars) {
no_grad_names.insert(name + kGradVarSuffix);
}
size_t uid = 0;
std::unordered_map<std::string, std::string> grad_to_var;
return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid);
}
// ==================================== //
static bool AllGradInSet(const std::vector<std::string>& names,
const std::unordered_set<std::string>& set) {
for (const std::string& name : names) {
if (!set.count(GradVarName(name))) {
return false;
}
}
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "All input {";
for (auto& name : names) {
sout << name << ",";
}
sout << "} is in {";
for (auto& name : set) {
sout << name << ",";
}
sout << "}";
VLOG(10) << sout.str();
}
return true;
}
static std::string FwdName(const std::string& grad_name) {
auto pos = grad_name.find("@GRAD");
if (pos == std::string::npos) {
return "";
} else {
return grad_name.substr(0, pos);
}
}
static void CreateGradVarInBlock(
size_t grad_op_start_index,
const std::unordered_map<std::string, std::string>& param_name_map,
BlockDesc* block_desc,
std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
auto ops = block_desc->AllOps();
for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) {
std::unordered_set<std::string> new_vars;
auto& ctrl_flow_ops = CtrlFlowOps();
ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) {
if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
ctrl_flow_ops.end()) {
if (block_desc->HasVarRecursive(grad_var_name)) {
return false;
}
} else {
if (block_desc->HasVar(grad_var_name)) {
return false;
}
}
if (grad_var_name == framework::kEmptyVarName) {
return false;
}
auto var = block_desc->Var(grad_var_name);
VLOG(10) << "Creating Variable " << grad_var_name;
new_vars.insert(var->Name());
auto it = param_name_map.find(grad_var_name);
if (it == param_name_map.end()) {
return false;
}
auto param_var_name = it->second;
auto& grad_record = (*grad_var_record)[param_var_name];
grad_record.name_ = grad_var_name;
grad_record.block_idx_ = block_desc->ID();
grad_record.op_idx_ = static_cast<int>(op_index);
return false; /* not break */
});
ops[op_index]->InferVarType(block_desc);
for (auto& arg : ops[op_index]->OutputArgumentNames()) {
if (new_vars.find(arg) == new_vars.end()) {
continue;
}
auto pname = FwdName(arg);
auto* param = block_desc->FindVarRecursive(pname);
auto* grad = block_desc->FindVar(arg);
if (param == nullptr) {
grad->SetDataType(proto::VarType::FP32);
} else {
grad->SetDataType(param->GetDataType());
}
}
ops[op_index]->InferShape(*block_desc);
}
}
std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var,
const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
// All input gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
if (AllGradInSet(inputs, *no_grad_vars)) {
VLOG(10) << "Drop operator " << op_desc->Type();
return grad_op_descs; // empty vector
}
// All output gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
if (AllGradInSet(outputs, *no_grad_vars)) {
VLOG(10) << "Drop operator " << op_desc->Type();
// FIXME: Hack code here
auto& ctrl_flow_ops = CtrlFlowOps();
if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
// Only computational op need drop input's gradient.
for (const std::string& name : inputs) {
no_grad_vars->insert(GradVarName(name));
VLOG(10) << " Also drop " << GradVarName(name);
}
}
return grad_op_descs; // empty vector
}
grad_op_descs =
OpInfoMap::Instance()
.Get(op_desc->Type())
.GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);
std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
for (auto& desc : grad_op_descs) {
for (const std::string& in_name : desc->InputArgumentNames()) {
if (no_grad_vars->count(in_name)) {
std::string prefix = in_name.substr(
0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
std::string new_name = prefix + kZeroVarSuffix;
desc->Rename(in_name, new_name);
std::unique_ptr<OpDesc> fill_zeros_op(
new OpDesc("fill_zeros_like", {{"X", {prefix}}},
{{"Out", {new_name}}}, AttributeMap{}));
pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
}
}
}
for (auto& p : pending_fill_zeros_ops) {
grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
}
return grad_op_descs;
}
static BlockDesc* CreateStepBlock(
ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var,
int step_block_idx);
std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
ProgramDesc& program_desc, int block_idx,
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var) {
VLOG(5) << "MakeBlockBackward";
BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
std::vector<OpDesc*> op_descs = cur_block->AllOps();
std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
size_t grad_desc_idx = 0;
std::vector<std::unique_ptr<OpDesc>> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector<std::unique_ptr<OpDesc>> op_grads;
if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" ||
(*it)->Type() == "parallel_do") {
int step_block_idx = (*it)->GetBlockAttr("sub_block");
BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
grad_to_var, step_block_idx);
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
} else if ((*it)->Type() == "conditional_block") {
BlockDesc* backward_block =
CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
(*it)->GetBlockAttr("sub_block"));
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
} else {
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
}
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "Made ";
for (auto& op_grad : op_grads) {
sout << op_grad->Type() << " ";
}
VLOG(10) << sout.str();
}
for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) {
if (out_name.find("@GRAD") == std::string::npos) {
// Not all outputs of a backward operator is a gradient. Only gradient
// need to be sum. Skip variables are not gradient.
continue;
}
dup_out_ops[out_name].emplace_back(grad_desc_idx);
}
++grad_desc_idx;
}
std::transform(op_grads.begin(), op_grads.end(),
std::back_inserter(backward_descs),
[](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
}
VLOG(5) << "Appending Sums";
// Check whether some variables are written more than once
std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
for (const auto& dup : dup_out_ops) {
const std::string& out_name = dup.first;
const std::vector<size_t> dup_op = dup.second;
if (out_name != kEmptyVarName && dup_op.size() > 1) {
std::vector<std::string> sum_op_inputs;
std::string next_g_name = out_name;
for (size_t i = 0; i < dup_op.size(); ++i) {
VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
<< " duplicated";
std::string new_name = out_name + "@RENAME@" + std::to_string(i);
backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
sum_op_inputs.emplace_back(new_name);
next_g_name = sum_op_inputs.back();
}
std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
{{"Out", {out_name}}},
AttributeMap{}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
}
}
pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
return a.first > b.first;
});
for (auto& p : pending_sum_ops) {
backward_descs.insert(backward_descs.begin() + p.first + 1,
std::move(p.second));
}
VLOG(5) << "MakeBlockBackward Finished";
return backward_descs;
}
static BlockDesc* CreateStepBlock(
ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var,
int step_block_idx) {
auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
no_grad_vars, grad_to_var);
BlockDesc* backward_block =
program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
for (auto& ptr : backward_block_op_descs) {
backward_block->AppendAllocatedOp(move(ptr));
}
return backward_block;
}
ParamGradInfoMap AppendBackward(
ProgramDesc& program_desc, const VarDesc& target,
const std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string> no_grad_var_names;
no_grad_var_names.reserve(no_grad_vars.size() + 1);
no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
for (auto& name : no_grad_vars) {
no_grad_var_names.insert(GradVarName(name));
}
const int root_block_idx = 0;
auto root_block = program_desc.MutableBlock(root_block_idx);
std::string fill_one_op_out = GradVarName(target.Name());
bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
PADDLE_ENFORCE(is_scalar, "target should be scalar");
VLOG(3) << "backward from loss=" << target.Name()
<< " data_type=" << target.GetDataType();
std::unique_ptr<OpDesc> fill_one_op(
new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
{{"shape", std::vector<int>{1}},
{"value", static_cast<float>(1.0)},
{"dtype", target.GetDataType()}}));
// infer var type of fill_one_op
fill_one_op->InferVarType(root_block);
root_block->AppendAllocatedOp(std::move(fill_one_op));
size_t forward_op_num = root_block->OpSize();
size_t forward_block_num = program_desc.Size();
// Insert backward operators
std::unordered_map<std::string, std::string> grad_to_var;
auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
&no_grad_var_names, &grad_to_var);
for (auto& ptr : backward_op_descs) {
root_block->AppendAllocatedOp(std::move(ptr));
}
// Create Variable
// Create target gradient variable
std::unordered_map<std::string, GradVarInfo> retv;
auto var = root_block->Var(fill_one_op_out);
var->SetDataType(target.GetDataType());
var->SetShape(target.GetShape());
auto& target_grad = retv[target.Name()];
target_grad.name_ = fill_one_op_out;
target_grad.block_idx_ = root_block_idx;
target_grad.op_idx_ = static_cast<int>(forward_op_num);
// create grad_var for all blocks in this program
CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
for (size_t block_index = forward_block_num;
block_index < program_desc.Size(); ++block_index) {
CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index),
&retv);
}
return retv;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
// Create the backward operator from a forward operator.
// TODO(yuyang18): Add more API reference comment.
extern std::unique_ptr<OperatorBase> Backward(
const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars);
struct GradVarInfo {
GradVarInfo() {}
GradVarInfo(const std::string& name, int block_idx, int op_idx)
: name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
bool operator==(const GradVarInfo& b) const {
return name_ == b.name_ && block_idx_ == b.block_idx_ &&
op_idx_ == b.op_idx_;
}
std::string name_;
int block_idx_;
int op_idx_;
};
using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
GradVarInfo /*grad_var_info*/>;
ParamGradInfoMap AppendBackward(
ProgramDesc& program_desc, const VarDesc& target,
const std::unordered_set<std::string>& no_grad_vars);
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/backward.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/net_op.h"
USE_NO_KERNEL_OP(fill_constant);
namespace paddle {
namespace framework {
using DeviceContext = platform::DeviceContext;
class NoneOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {}
};
template <typename Place, typename T>
class NoneKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {}
};
class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
public:
RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input X of Add");
AddInput("b", "Bias of Add");
AddOutput("Out", "Out of Add");
AddComment("Add Op");
}
};
class RowWiseAddGradMaker : public SingleGradOpDescMaker {
public:
using SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<OpDesc> Apply() const override {
auto grad_op = new OpDesc();
grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
grad_op->SetType("rowwise_add_grad");
return std::unique_ptr<OpDesc>(grad_op);
}
};
class MulOpMaker : public OpProtoAndCheckerMaker {
public:
MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "A");
AddInput("Y", "B");
AddOutput("Out", "Out");
AddAttr<int>("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
AddAttr<int>("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
AddComment("Mul");
}
};
class SigmoidOpMaker : public OpProtoAndCheckerMaker {
public:
SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "X");
AddOutput("Out", "Y");
AddComment("Sigmoid");
}
};
class NoGradOpMaker : public OpProtoAndCheckerMaker {
public:
NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "X input");
AddOutput("Out", "Y output");
AddComment("NoGradOp, same input output. no Grad");
}
};
class FcOp : public operators::NetOp {
public:
FcOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
AppendOp(OpRegistry::CreateOp(
"mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}},
{{"Out", {Output("mul_result")}}}, AttributeMap{}));
auto input_b = Inputs("b");
std::string before_act = "mul_result";
if (input_b.size() != 0) {
AppendOp(OpRegistry::CreateOp(
"rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
{{"Out", {Output("add_result")}}}, AttributeMap{}));
before_act = "add_result";
} else {
auto out_varname = Output("add_result");
if (out_varname != kEmptyVarName) {
this->Rename(out_varname, kEmptyVarName);
}
}
AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
{{"Out", {Output("Out")}}}, AttributeMap{}));
CompleteAddOp(false);
}
};
class FcOpMaker : public OpProtoAndCheckerMaker {
public:
FcOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "x");
AddInput("W", "w");
AddInput("b", "b");
AddOutput("mul_result", "").AsIntermediate();
AddOutput("add_result", "").AsIntermediate();
AddOutput("Out", "");
AddComment("");
}
};
class ManyOutputOpMaker : public OpProtoAndCheckerMaker {
public:
ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("x", "x");
AddOutput("y", "y");
AddOutput("z", "z");
AddComment("");
}
};
class FillZeroOpMaker : public OpProtoAndCheckerMaker {
public:
FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "x");
AddOutput("Out", "out");
AddComment("");
}
};
class SumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "the input tensors of sum operator.").AsDuplicable();
AddOutput("Out", "the output tensor of sum operator.");
AddComment("");
}
};
class MultInOutOpMaker : public OpProtoAndCheckerMaker {
public:
MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "x");
AddInput("H", "h");
AddOutput("Y", "y");
AddOutput("Z", "z");
AddComment("");
}
};
class MinusGradOpDescMaker : public GradOpDescMakerBase {
public:
using GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<OpDesc>> operator()() const override {
std::vector<std::unique_ptr<OpDesc>> retv;
auto x_g = InputGrad("X");
if (!x_g.empty()) {
auto *op_desc = new OpDesc();
op_desc->SetType("scale");
op_desc->SetInput("X", OutputGrad("Out"));
op_desc->SetOutput("Out", x_g);
op_desc->SetAttr("scale", 1.0f);
retv.emplace_back(op_desc);
}
auto y_g = InputGrad("Y");
if (!y_g.empty()) {
auto *op_desc = new OpDesc();
op_desc->SetType("scale");
op_desc->SetInput("X", OutputGrad("Out"));
op_desc->SetOutput("Out", y_g);
op_desc->SetAttr("scale", -1.0f);
retv.emplace_back(op_desc);
}
return retv;
}
};
class MinusOpMaker : public OpProtoAndCheckerMaker {
public:
MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "");
AddInput("Y", "");
AddOutput("Out", "");
AddComment("minus for unittest");
}
};
} // namespace framework
} // namespace paddle
namespace f = paddle::framework;
namespace ops = paddle::operators;
using EnforceNotMet = paddle::platform::EnforceNotMet;
// rowwise_add
REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker,
f::RowWiseAddGradMaker);
REGISTER_OP_CPU_KERNEL(rowwise_add,
f::NoneKernel<paddle::platform::CPUPlace, float>);
REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp);
REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
f::NoneKernel<paddle::platform::CPUPlace, float>);
// mul
REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp);
REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mul_grad,
f::NoneKernel<paddle::platform::CPUPlace, float>);
// sigmoid
REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp);
REGISTER_OP_CPU_KERNEL(sigmoid,
f::NoneKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker);
// fill_zeros_like
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker);
REGISTER_OP_CPU_KERNEL(fill_zeros_like,
f::NoneKernel<paddle::platform::CPUPlace, float>);
// sum
REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp);
REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(sum_grad,
f::NoneKernel<paddle::platform::CPUPlace, float>);
// fc
REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
// many_output_op
REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker,
many_output_op_grad, f::NoneOp);
// mult_in_out
REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad,
f::NoneOp);
REGISTER_OP_CPU_KERNEL(mult_in_out,
f::NoneKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mult_in_out_grad,
f::NoneKernel<paddle::platform::CPUPlace, float>);
// minus
REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker);
REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel<paddle::platform::CPUPlace, float>);
// scale
REGISTER_OPERATOR(scale, f::NoneOp);
REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel<paddle::platform::CPUPlace, float>);
TEST(Backward, simple_op_not_need_grad) {
auto fwd =
f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
{{"Out", {"out"}}}, f::AttributeMap{});
ASSERT_NE(fwd, nullptr);
auto gop = f::Backward(*fwd, {"x"});
ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName);
auto no_input_gop = f::Backward(*fwd, {"x", "b"});
ASSERT_NE(no_input_gop, nullptr);
ASSERT_TRUE(no_input_gop->IsNetOp());
ASSERT_EQ(0UL, static_cast<ops::NetOp *>(no_input_gop.get())->ops_.size());
}
TEST(Backward, net_fc_backward_normal) {
std::shared_ptr<f::OperatorBase> fwd =
f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
{{"mul_result", {"mul_res"}},
{"add_result", {"add_re"}},
{"Out", {"out"}}},
f::AttributeMap{});
ASSERT_NE(fwd, nullptr);
std::shared_ptr<f::OperatorBase> gop =
f::Backward(*fwd, std::unordered_set<std::string>{});
ASSERT_TRUE(gop->IsNetOp());
auto net = static_cast<ops::NetOp *>(gop.get());
ASSERT_NO_THROW(net->DebugString());
ASSERT_EQ(3UL, net->ops_.size());
f::OperatorBase &d_sigmoid = *net->ops_[0];
ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
f::OperatorBase &d_add = *net->ops_[1];
ASSERT_EQ("rowwise_add_grad", d_add.Type());
f::OperatorBase &d_mul = *net->ops_[2];
ASSERT_EQ("mul_grad", d_mul.Type());
}
TEST(Backward, net_fc_backward_not_have_b) {
std::shared_ptr<f::OperatorBase> fwd =
f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}},
{{"mul_result", {"mul_res"}},
{"add_result", {"add_res"}},
{"Out", {"tmp"}}},
f::AttributeMap{});
ASSERT_NE(fwd, nullptr);
std::shared_ptr<f::OperatorBase> gop =
f::Backward(*fwd, std::unordered_set<std::string>{});
ASSERT_TRUE(gop->IsNetOp());
auto net = static_cast<ops::NetOp *>(gop.get());
ASSERT_NO_THROW(net->DebugString());
ASSERT_EQ(2UL, net->ops_.size());
f::OperatorBase &d_sigmoid = *net->ops_[0];
ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
f::OperatorBase &d_mul = *net->ops_[1];
ASSERT_EQ("mul_grad", d_mul.Type());
}
TEST(Backward, net_input_of_network_not_need_grad) {
ops::NetOp net;
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
{{"mul_result", {"mul_tmp_0"}},
{"add_result", {"add_tmp_0"}},
{"Out", {"hidden0"}}},
f::AttributeMap{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_tmp_1"}},
{"add_result", {"add_tmp_1"}},
{"Out", {"hidden1"}}},
f::AttributeMap{}));
net.CompleteAddOp();
auto bwd = Backward(net, {"x"}); // x@GRAD is not need.
ASSERT_TRUE(bwd->IsNetOp());
auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
auto output_vars = bwd_net->OutputVars(true);
std::unordered_set<std::string> all_outputs =
std::unordered_set<std::string>(output_vars.begin(), output_vars.end());
all_outputs.erase(f::kEmptyVarName);
for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end());
}
// Not Generated X
ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end());
ASSERT_EQ(2UL, bwd_net->ops_.size());
ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
ASSERT_EQ(3UL, first_fc_grad->ops_.size());
ASSERT_EQ(f::kEmptyVarName,
first_fc_grad->ops_[2]->Output(f::GradVarName("X")));
}
TEST(Backward, net_shared_weight) {
ops::NetOp net;
net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
{{"Out", {"out"}}}, f::AttributeMap{}));
net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
{{"Out", {"FinalOut"}}},
f::AttributeMap{}));
net.CompleteAddOp();
auto bwd = f::Backward(net, std::unordered_set<std::string>{});
ASSERT_TRUE(bwd->IsNetOp());
auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
ASSERT_EQ(3UL, bwd_net->ops_.size());
ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
}
TEST(Backward, op_all_input_are_not_need) {
auto fwd =
f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
{{"Out", {"out"}}}, f::AttributeMap{});
auto backward = f::Backward(*fwd, {"x", "b"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp *>(backward.get());
ASSERT_TRUE(net->ops_.empty());
}
TEST(Backward, op_all_output_are_not_need) {
auto fwd =
f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
{{"Out", {"out"}}}, f::AttributeMap{});
auto backward = f::Backward(*fwd, {"out"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp *>(backward.get());
ASSERT_TRUE(net->ops_.empty());
}
TEST(Backward, op_part_of_output_are_not_need) {
auto fwd =
f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
{{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{});
auto backward = f::Backward(*fwd, {"Z"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp *>(backward.get());
ASSERT_EQ(net->ops_.size(), 2UL);
auto &fill_zero = *net->ops_[0];
ASSERT_EQ("fill_zeros_like", fill_zero.Type());
ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
ASSERT_EQ("Z", fill_zero.Input("X"));
ASSERT_EQ(1UL, fill_zero.Outputs("Out").size());
ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out"));
auto &d_many_out = *net->ops_[1];
ASSERT_EQ("many_output_op_grad", d_many_out.Type());
ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size()); // I/O/OG
ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
d_many_out.Input(f::GradVarName("z")));
ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x")));
}
TEST(Backward, op_part_of_input_are_not_need) {
auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}},
{{"Out", {"out"}}}, f::AttributeMap{});
auto backward = f::Backward(*fwd, {"a"});
auto &grad_mul = *backward;
ASSERT_EQ(grad_mul.Type(), "mul_grad");
ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL);
ASSERT_EQ(grad_mul.Outputs().size(), 2UL);
ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName);
ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b"));
ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
ASSERT_EQ(grad_mul.Input("X"), "a");
ASSERT_EQ(grad_mul.Input("Y"), "b");
ASSERT_EQ(grad_mul.Input("Out"), "out");
}
TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
ops::NetOp net;
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
{{"mul_result", {"mul_out1"}},
{"add_result", {"add_out1"}},
{"Out", {"out1"}}},
f::AttributeMap{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_out2"}},
{"add_result", {"tmp_out2"}},
{"Out", {"out2"}}},
f::AttributeMap{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
{{"mul_result", {"mul_out3"}},
{"add_result", {"tmp_out3"}},
{"Out", {"out3"}}},
f::AttributeMap{}));
net.CompleteAddOp();
auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
ASSERT_TRUE(backward->IsNetOp());
auto bwd_net = static_cast<ops::NetOp *>(backward.get());
ASSERT_EQ(bwd_net->ops_.size(), 3UL);
auto &grad_fc = *bwd_net->ops_[0];
const char *all = paddle::operators::NetOp::kAll;
EXPECT_EQ(grad_fc.Inputs(all).size(),
2UL /* external input number */
+ 1UL /* external output number*/
+ 1UL /* number of gradient of external output*/
+ 2UL /* internal variable number*/
);
EXPECT_EQ(grad_fc.Outputs(all).size(),
2UL /* input number of mul*/
+ 2UL /* input number of rowwise_add*/
+ 1UL /* input number of sigmod */
- 1UL /* out2 is not needed*/);
EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL);
EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
}
TEST(Backward, simple_single_op) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::OpDesc *op = block->AppendOp();
op->SetType("rowwise_add");
op->SetInput("X", {"x"});
op->SetInput("b", {"b"});
op->SetOutput("Out", {"out"});
auto target = f::VarDesc("out");
target.SetShape({1});
auto var_to_grad =
AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 3UL);
f::OpDesc *fill_op = block->AllOps()[1];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDesc *grad_op = block->AllOps()[2];
EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op->InputNames().size(), 1UL);
ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b")}));
EXPECT_EQ(var_to_grad.size(), 3UL);
EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2));
EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2));
EXPECT_TRUE(block->HasVar(f::GradVarName("b")));
EXPECT_TRUE(block->HasVar(f::GradVarName("x")));
}
TEST(Backward, default_attribute) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::OpDesc *op = block->AppendOp();
op->SetType("mul");
op->SetInput("X", {"x"});
op->SetInput("Y", {"y"});
op->SetOutput("Out", {"out"});
op->CheckAttrs();
auto target = f::VarDesc("out");
target.SetShape({1});
AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 3UL);
EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
f::OpDesc *fill_op = block->AllOps()[1];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDesc *grad_op = block->AllOps()[2];
ASSERT_EQ(grad_op->Type(), "mul_grad");
EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
}
TEST(Backward, simple_mult_op) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::OpDesc *op1 = block->AppendOp();
op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"});
f::OpDesc *op2 = block->AppendOp();
op2->SetType("mul");
op2->SetInput("X", {"out1"});
op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"});
f::OpDesc *op3 = block->AppendOp();
op3->SetType("rowwise_add");
op3->SetInput("X", {"out2"});
op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"});
auto target = f::VarDesc("out3");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad =
AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 6UL + 1);
f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDesc *grad_op1 = block->AllOps()[6];
EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
f::OpDesc *grad_op2 = block->AllOps()[5];
EXPECT_EQ(grad_op2->Type(), "mul_grad");
ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out2")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")}));
f::OpDesc *grad_op3 = block->AllOps()[4];
EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out3")}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out2")}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b3")}));
EXPECT_EQ(var_to_grad.size(), 7UL);
EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
EXPECT_EQ(var_to_grad.at("out1"),
f::GradVarInfo(f::GradVarName("out1"), 0, 5));
EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
EXPECT_EQ(var_to_grad.at("out2"),
f::GradVarInfo(f::GradVarName("out2"), 0, 4));
EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
EXPECT_TRUE(block->HasVar(f::GradVarName("out2")));
EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
}
TEST(Backward, intermedia_var_no_grad) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::OpDesc *op1 = block->AppendOp();
op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"});
f::OpDesc *op2 = block->AppendOp();
op2->SetType("mul");
op2->SetInput("X", {"x2"});
op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"});
f::OpDesc *op3 = block->AppendOp();
op3->SetType("rowwise_add");
op3->SetInput("X", {"out2"});
op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"});
f::OpDesc *op4 = block->AppendOp();
op4->SetType("mul");
op4->SetInput("X", {"out1"});
op4->SetInput("Y", {"out3"});
op4->SetOutput("Out", {"out4"});
auto target = f::VarDesc("out4");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"out3"});
ASSERT_EQ(block->AllOps().size(), 7UL);
f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDesc *grad_op1 = block->AllOps()[6];
EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
f::OpDesc *grad_op4 = block->AllOps()[5];
EXPECT_EQ(grad_op4->Type(), "mul_grad");
ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out4")}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector<std::string>());
EXPECT_EQ(var_to_grad.size(), 4UL);
EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
EXPECT_EQ(var_to_grad.at("out1"),
f::GradVarInfo(f::GradVarName("out1"), 0, 5));
EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
}
TEST(Backward, var_no_grad) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::OpDesc *op1 = block->AppendOp();
op1->SetType("mult_in_out");
op1->SetInput("X", {"x1"});
op1->SetInput("H", {"h1"});
op1->SetOutput("Y", {"y1"});
op1->SetOutput("Z", {"z1"});
f::OpDesc *op2 = block->AppendOp();
op2->SetType("mult_in_out");
op2->SetInput("X", {"y1"});
op2->SetInput("H", {"z1"});
op2->SetOutput("Y", {"y2"});
op2->SetOutput("Z", {"z2"});
auto target = f::VarDesc("z2");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"z1"});
ASSERT_EQ(block->AllOps().size(), 6UL);
f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDesc *grad_op2 = block->AllOps()[3];
ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")}));
EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
std::vector<std::string>({f::GradVarName("z2")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("y1")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
f::OpDesc *fill_zero_op = block->AllOps()[4];
ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
EXPECT_EQ(fill_zero_op->Output("Out"),
std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
f::OpDesc *grad_op1 = block->AllOps()[5];
ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y1")}));
EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),
std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
std::vector<std::string>({f::GradVarName("h1")}));
EXPECT_EQ(var_to_grad.size(), 4UL);
EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3));
EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5));
EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5));
EXPECT_TRUE(block->HasVar(f::GradVarName("y1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("h1")));
}
TEST(Backward, shared_var) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::OpDesc *op1 = block->AppendOp();
op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"});
f::OpDesc *op2 = block->AppendOp();
op2->SetType("mul");
op2->SetInput("X", {"out1"});
op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"});
f::OpDesc *op3 = block->AppendOp();
op3->SetType("rowwise_add");
op3->SetInput("X", {"out1"});
op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"});
auto target = f::VarDesc("out3");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad =
AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 8UL);
f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDesc *grad_op3 = block->AllOps()[4];
ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out3")}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b3")}));
f::OpDesc *grad_op4 = block->AllOps()[5];
ASSERT_EQ(grad_op4->Type(), "mul_grad");
ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"y2"}));
EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out2"}));
EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out2")}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")}));
f::OpDesc *sum_op = block->AllOps()[6];
ASSERT_EQ(sum_op->Type(), "sum");
ASSERT_EQ(sum_op->InputNames().size(), 1UL);
ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
EXPECT_EQ(sum_op->Input("X"),
std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
f::GradVarName("out1") + "@RENAME@1"}));
EXPECT_EQ(sum_op->Output("Out"),
std::vector<std::string>({f::GradVarName("out1")}));
f::OpDesc *grad_op1 = block->AllOps()[7];
ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
EXPECT_EQ(var_to_grad.size(), 6UL);
EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
EXPECT_EQ(var_to_grad.at("out1"),
f::GradVarInfo(f::GradVarName("out1"), 0, 6));
EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7));
EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7));
EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
}
TEST(Backward, half_backward) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
auto *op1 = block->AppendOp();
op1->SetType("minus");
op1->SetInput("X", {"a"});
op1->SetInput("Y", {"b"});
op1->SetOutput("Out", {"out"});
auto target = f::VarDesc("out");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"b"});
f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
auto ops = block->AllOps();
ASSERT_EQ(3UL, ops.size());
EXPECT_EQ(var_to_grad.size(), 2UL);
EXPECT_EQ(var_to_grad.at("a"),
f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1));
}
......@@ -92,7 +92,7 @@ class BlockDesc {
/*
* Remove Op and its input/output variables.
* Note that for either input or ouput variable, if it is also an input or
* Note that for either input or output variable, if it is also an input or
* output variable of other ops, we should remain it.
*/
void RemoveOp(size_t s, size_t e);
......
......@@ -5,6 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
......@@ -15,7 +16,7 @@ else()
set(multi_devices_graph_builder_deps)
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
......@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include <string>
namespace paddle {
namespace framework {
namespace details {
......@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
}
}
op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
}
std::string ComputationOpHandle::Name() const { return op_->Type(); }
......
......@@ -14,6 +14,9 @@
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include <string>
#include <vector>
namespace paddle {
namespace framework {
namespace details {
......@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
for (size_t i = 0; i < scopes.size(); ++i) {
auto &scope = scopes[i];
auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
auto &t = scope->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(var_name)
->Get<framework::LoDTensor>();
if (platform::is_gpu_place(var->place_)) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/send_op_handle.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
......@@ -54,6 +55,27 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
}
}
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
const platform::Place &p,
const size_t &i) const {
auto *op_handle = result->ops_.back().get();
op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
platform::DeviceContextPool::Instance().Get(p));
auto var_names = op->InputArgumentNames();
for (auto &each_var_name : var_names) {
VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
op_handle->AddInput(var);
}
var_names = op->OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(result, op_handle, each_var_name, p, i);
}
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
auto graph = new SSAGraph();
......@@ -76,27 +98,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
}
// append send op if program is distributed trainer main program.
// always use the first device
if (!is_forwarding && op->Type() == "send") {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
CreateOpHandleIOs(&result, op, p, 0);
continue;
}
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i];
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get();
op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
platform::DeviceContextPool::Instance().Get(p));
CreateOpHandleIOs(&result, op, p, i);
auto var_names = op->InputArgumentNames();
for (auto &each_var_name : var_names) {
VarHandle *var =
CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
op_handle->AddInput(var);
}
var_names = op->OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(&result, op_handle, each_var_name, p, i);
}
auto var_names = op->OutputArgumentNames();
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
......
......@@ -14,6 +14,9 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle {
......@@ -41,6 +44,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
private:
void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
const size_t &i) const;
private:
std::string loss_var_name_;
const std::vector<platform::Place> &places_;
......
......@@ -24,6 +24,8 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
class OpHandleBase {
private:
DISABLE_COPY_AND_ASSIGN(OpHandleBase);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/send_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc,
const Scope *local_scope,
const platform::Place &place)
: op_(framework::OpRegistry::CreateOp(op_desc)),
local_scope_(local_scope),
place_(place) {}
void SendOpHandle::RunImpl() {
// Wait input done
for (auto *in : inputs_) {
auto &p = static_cast<VarHandle *>(in)->place_;
if (in->DebugString() == "dummy") { // HACK
continue;
}
in->generated_op_->Wait(dev_ctxes_[p]);
}
op_->Run(*local_scope_, place_);
}
std::string SendOpHandle::Name() const { return "send"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
struct SendOpHandle : public OpHandleBase {
std::unique_ptr<OperatorBase> op_;
const Scope* local_scope_;
const platform::Place& place_;
SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
const platform::Place& place);
std::string Name() const override;
// Delay and buffer nccl_all_reduce together can significantly increase
// performance. Disable this feature by returning false.
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -15,13 +15,15 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
namespace paddle {
namespace framework {
namespace details {
class SSAGraphExecutor {
DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
......
......@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
ready_ops.clear();
};
// Create local scopes.
for (auto &scope : local_scopes_) {
auto &local_scope = scope->NewScope();
*scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
}
// Step 3. Execution
while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
// 1. Run All Ready ops
......@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
PADDLE_ENFORCE(ready_ops.empty());
PADDLE_ENFORCE(delayed_ops.empty());
PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
++computation_count_;
auto sync_computation = [&] {
computation_count_ = 0;
// Wait All computational streams
for (auto p : this->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
for (auto &scope : local_scopes_) {
scope->DropKids();
}
};
// Wait FetchOps.
if (!fetch_ops.empty()) {
fetch_ops.clear();
sync_computation();
}
if (computation_count_ == max_async_computation) {
sync_computation();
}
// NOTE: the temp scope can be dropped lazily if needed.
// Drop tmp scopes;
for (auto &scope : local_scopes_) {
auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
kid = nullptr;
}
return fetch_data;
......
......@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::unique_ptr<platform::EnforceNotMet> exception_;
std::atomic<int> running_ops_;
bool allow_op_delay_;
size_t computation_count_{0};
size_t max_async_computation{100};
};
} // namespace details
......
......@@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
}
}
static DDim GetDims(const Scope& scope, const std::string& name) {
static DDim GetDims(const Scope& scope, const std::string& name,
bool get_actual_dim = false) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
return DDim({-1});
......@@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims();
} else if (var->IsType<SelectedRows>()) {
return var->Get<SelectedRows>().GetCompleteDims();
if (get_actual_dim) {
return var->Get<SelectedRows>().value().dims();
} else {
return var->Get<SelectedRows>().GetCompleteDims();
}
} else {
return DDim({-1});
}
......@@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
for (size_t i = 0; i < input.second.size(); ++i) {
ss << input.second[i];
if (scope) {
ss << "[" << GetDims(*scope, input.second[i]) << "]";
ss << "[" << GetDims(*scope, input.second[i], true) << "]";
ss << "(" << GetLoD(*scope, input.second[i]) << ")";
}
if (i != input.second.size() - 1) {
......@@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
for (size_t i = 0; i < output.second.size(); ++i) {
ss << output.second[i];
if (scope) {
ss << "[" << GetDims(*scope, output.second[i]) << "]";
ss << "[" << GetDims(*scope, output.second[i], true) << "]";
ss << "(" << GetLoD(*scope, output.second[i]) << ")";
}
if (i != output.second.size() - 1) {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <string>
#include <tuple>
#include <vector>
#ifdef PADDLE_WITH_CUDA
......@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
};
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
......@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
allow_op_delay));
// Step 3. Create vars in each scope;
for (auto *scope : member_->local_scopes_) {
for (auto *var : main_program.Block(0).AllVars()) {
if (scope->FindVar(var->Name()) != nullptr) {
continue;
}
InitializeVariable(scope->Var(var->Name()), var->GetType());
}
for (auto *var : main_program.Block(0).AllVars()) {
member_->var_types_.emplace_back(var->Name(), var->GetType(),
var->Persistable());
}
}
......@@ -163,9 +161,42 @@ void ParallelExecutor::Run(
const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
platform::RecordBlock b(0);
SplitTensorToPlaces(feed_tensors);
// Create local scopes.
for (auto &scope : member_->local_scopes_) {
Scope &local_scope = scope->NewScope();
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
&local_scope;
for (auto &name_type_pair : member_->var_types_) {
if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
continue;
}
if (std::get<2>(name_type_pair)) { // Persistable
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
} else {
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
}
}
}
auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data;
// Wait All computational streams
for (auto p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
for (auto &scope : member_->local_scopes_) {
auto &local_scope =
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
scope->DeleteScope(local_scope);
local_scope = nullptr;
}
}
void ParallelExecutor::SplitTensorToPlaces(
......
......@@ -48,13 +48,13 @@ class ParallelExecutor {
const std::string& fetched_var_name,
const std::unordered_map<std::string, LoDTensor>& feed_tensors);
void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
private:
void SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor>& feed_tensors);
ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
};
} // namespace framework
......
......@@ -14,18 +14,17 @@ limitations under the License. */
#include "paddle/fluid/framework/prune.h"
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/net_op.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include <gtest/gtest.h>
namespace f = paddle::framework;
namespace ops = paddle::operators;
void AddOp(const std::string &type, const f::VariableNameMap &inputs,
const f::VariableNameMap &outputs, f::AttributeMap attrs,
......
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
cc_library(paddle_fluid_api
SRCS io.cc
......@@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules})
# Create shared library
cc_library(paddle_fluid_shared SHARED
SRCS io.cc
DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
DEPS ${fluid_modules})
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE)
# TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
......
......@@ -17,10 +17,16 @@ limitations under the License. */
#include <fstream>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace inference {
// Temporarilly add this function for exposing framework::InitDevices() when
// linking the inference shared library.
void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
void ReadBinaryFile(const std::string& filename, std::string& contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
......
......@@ -18,12 +18,15 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace inference {
void Init(bool init_p2p);
void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
......
......@@ -17,14 +17,15 @@ function(inference_test TARGET_NAME)
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(test_inference_${TARGET_NAME}${arg}
SRCS test_inference_${TARGET_NAME}.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
DEPS paddle_fluid
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
set_tests_properties(test_inference_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
endforeach()
endfunction(inference_test)
inference_test(fit_a_line)
# This unittest is buggy!
#inference_test(fit_a_line)
inference_test(image_classification ARGS vgg resnet)
inference_test(label_semantic_roles)
inference_test(recognize_digits ARGS mlp conv)
......
......@@ -100,7 +100,7 @@ function(op_library TARGET)
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......@@ -199,7 +199,6 @@ else()
set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
endif()
op_library(cond_op DEPS framework_proto tensor net_op)
op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax)
......@@ -259,7 +258,6 @@ endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
......
......@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
const auto *bias = ctx.Input<Tensor>("Bias");
auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
// alloc memory
y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
auto handle = dev_ctx.cudnn_handle();
......@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
// Run training mode.
// obtain running mean and running inv var, and see if we need to
// initialize them.
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
double this_factor = 1. - momentum;
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include <string>
#include <vector>
......@@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel {
size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
const size_t n = ins.size();
PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
if (n == 1) {
VLOG(3) << "Warning: concat op have only one input, may waste memory";
}
auto out_dims = ins[0];
size_t in_zero_dims_size = out_dims.size();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cond_op.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/scatter.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
using Scope = framework::Scope;
using Variable = framework::Variable;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DDim = framework::DDim;
framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
auto sub_scopes_var = scope.FindVar("SubScopes");
PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
"Output(SubScopes) of CondOp should not be null.");
auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
auto& sub_scope = scope.NewScope();
sub_scopes->push_back(&sub_scope);
return sub_scope;
}
std::vector<framework::Scope*>& CondOp::GetSubScopes(
const framework::Scope& scope) const {
auto sub_scopes_var = scope.FindVar("SubScopes");
PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
"Output(SubScopes) of CondOp should not be null.");
return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
}
LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
auto index_tensors_var = scope.FindVar("IndexTensors");
PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
"Output(IndexTensors) of CondOp should not be null.");
auto& index_tensors =
*index_tensors_var->GetMutable<std::vector<LoDTensor>>();
index_tensors.push_back(LoDTensor());
return index_tensors.back();
}
std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
const framework::Scope& scope) const {
auto* index_tensors_var = scope.FindVar("IndexTensors");
PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
"Output(IndexTensors) of CondOp should not be null.");
return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
}
void CondOp::PrepareDataForSubnet(
const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const {
PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
for (int i = 0; i < BRANCH_NUM; ++i) {
// Create two sub scopes for true and false branches
// sub_scopes[0] for the true branch
// sub_scopes[1] for the false branch
AddSubScope(scope);
// Create two tensors for true and false indices:
// index_tensors[0] for the true branch
// index_tensors[1] for the false branch
AddIndexTensor(scope);
}
Variable* cond_var = scope.FindVar(Input("Cond"));
PADDLE_ENFORCE_NOT_NULL(cond_var,
"Input(Cond) of CondOp should not be null.");
const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
// get the true/false index at runtime according to cond tensor
// index_vectors[0]: vector<int>, contains all index for cond[i] == true
// index_vectors[1]: vector<int>, contains all index for cond[i] == false
std::vector<std::vector<int>> index_vectors;
index_vectors.resize(BRANCH_NUM);
const int* cond_data = cond->data<int>();
for (int i = 0; i < cond->dims()[0]; ++i) {
if (cond_data[i])
index_vectors[TRUE_BRANCH].push_back(i);
else
index_vectors[FALSE_BRANCH].push_back(i);
}
// put index_vectors[0] and index_vectors[1] into two tensors:
// index_tensors[0] and index_tensors[1]
std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
for (int i = 0; i < BRANCH_NUM; ++i) {
DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
int* index_tensor_data_ptr =
index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
memcpy(index_tensor_data_ptr, index_vectors[i].data(),
dim[0] * sizeof(int));
}
// create input in subscopes according to index_vectors
for (auto& input : Inputs("Xs")) {
Variable* var_parent = scope.FindVar(input);
PADDLE_ENFORCE_NOT_NULL(var_parent);
const auto* tensor_parent = &var_parent->Get<LoDTensor>();
for (int i = 0; i < BRANCH_NUM; ++i) {
Variable* var_child = sub_scopes[i]->FindVar(input);
PADDLE_ENFORCE_NOT_NULL(var_child);
auto* tensor_child = var_child->GetMutable<LoDTensor>();
// Resize child
DDim dim = tensor_parent->dims();
dim[0] = index_tensors[i].dims()[0];
tensor_child->mutable_data<float>(dim, platform::CPUPlace());
CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
}
}
// create output_tensors in subscope for sub_net
for (int i = 0; i < BRANCH_NUM; ++i) {
for (auto& output : (*sub_net_op_[i]).Outputs()) {
for (auto& var_name : output.second) {
sub_scopes[i]->Var(var_name);
}
}
}
}
void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const {
std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
const std::vector<framework::LoDTensor>& index_tensors =
GetIndexTensors(scope);
// Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
PADDLE_ENFORCE(!Outputs("Outs").empty(),
"Outputs(Outs) of CondOp can't be empty.");
for (auto& output : Outputs("Outs")) {
const LoDTensor* tensor_t_out =
&sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
const LoDTensor* tensor_f_out =
&sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
auto* var_out = scope.FindVar(output);
PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
"True output tensor should not be NULL");
DDim true_dim = tensor_t_out->dims();
DDim false_dim = tensor_f_out->dims();
true_dim[0] = 0;
false_dim[0] = 0;
PADDLE_ENFORCE_EQ(true_dim, false_dim,
"Outputs not of the same shape except the first dim");
DDim out_dim = tensor_t_out->dims();
out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
tensor_out->Resize(out_dim);
tensor_out->mutable_data<float>(platform::CPUPlace());
}
// merge output results:
// output_tensor = true_output_tensor + false_output_tensor
for (auto& output : Outputs("Outs")) {
Variable* var_parent = scope.FindVar(output);
PADDLE_ENFORCE_NOT_NULL(var_parent);
auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
for (int i = 0; i < BRANCH_NUM; ++i) {
Variable* var_child = sub_scopes[i]->FindVar(output);
PADDLE_ENFORCE_NOT_NULL(var_child);
auto* tensor_child = &var_child->Get<LoDTensor>();
ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
tensor_parent);
}
}
}
void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
// get device context from pool
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Get(place);
PrepareDataForSubnet(scope, dev_ctx);
std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
for (int i = 0; i < BRANCH_NUM; ++i) {
sub_net_op_[i]->Run(*sub_scopes[i], place);
}
MergeDataFromSubnet(scope, dev_ctx);
}
class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
public:
CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Cond", "The condition, which is a bool vector");
AddInput("Xs", "Inputs of Subnets").AsDuplicable();
AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
AddOutput("SubScopes", "sub scopes for true and false branches");
AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
AddComment(R"DOC(
Sample Dependent Conditional Operator.
Given Cond[i] as a 1/0 vector to indicate true/false:
Out[i] = subnet_true[i], if Cond[i] == true
Out[i] = subnet_false[i], if Cond[i] == false
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
paddle::operators::CondOpProtoAndCheckerMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/net_op.h"
namespace paddle {
namespace operators {
/*
* @brief CondOp is a dynamic if-else Operator
*
* It has a input tensor named cond indicating which netop each instance will
* run.
*
* if cond == 1, it will run true_net, which is a NetOp.
*
* if cond == 0, it will run false_net, which is another NetOp.
*/
class CondOp : public framework::OperatorBase {
public:
CondOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {
sub_net_op_.resize(BRANCH_NUM);
}
CondOp(const CondOp& o)
: framework::OperatorBase(
static_cast<const framework::OperatorBase&>(o)) {
// TODO(yuyang18): Implement copy ctor well.
PADDLE_THROW("Not implemented");
}
framework::Scope& AddSubScope(const framework::Scope& scope) const;
std::vector<framework::Scope*>& GetSubScopes(
const framework::Scope& scope) const;
framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
std::vector<framework::LoDTensor>& GetIndexTensors(
const framework::Scope& scope) const;
void PrepareDataForSubnet(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const;
void MergeDataFromSubnet(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const;
/*
* Set True Block
*/
void set_truenet(std::unique_ptr<OperatorBase>&& net) {
sub_net_op_[TRUE_BRANCH] = std::move(net);
}
/*
* Set False Block
*/
void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
sub_net_op_[FALSE_BRANCH] = std::move(net);
}
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override;
private:
const int TRUE_BRANCH = 0;
const int FALSE_BRANCH = 1;
const int BRANCH_NUM = 2;
// sub_net_op_[0]: subnet_t
// sub_net_op_[1]: subnet_f
std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
};
} // namespace operators
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <vector>
#include "paddle/fluid/operators/ctc_align_op.h"
namespace paddle {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <string.h>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -65,9 +65,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
}
void ProcGetResponse(const VarHandle& var_h,
// const sendrecv::VariableMessage& ret_msg) {
const ::grpc::ByteBuffer& ret_msg) {
framework::Variable* outvar = NULL;
framework::Variable* outvar = nullptr;
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
}
......
......@@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase {
::grpc::ByteBuffer reply;
std::string var_name = request_->OutVarname();
VLOG(3) << "prefetch var " << var_name;
auto var_desc = program_->Block(0).FindVar(var_name);
framework::Scope* local_scope = &scope_->NewScope();
auto* var = local_scope->FindVar(var_name);
......
......@@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
}
for (int64_t i = 0; i < rows2->size(); ++i) {
for (size_t i = 0; i < rows2->size(); ++i) {
EXPECT_EQ(rows_data2[i], i);
}
EXPECT_EQ(slr2->height(), 1000);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
......@@ -106,18 +107,18 @@ information. However, the output only shares the LoD information with input $X$.
protected:
std::string comment_;
void Replace(std::string& src, std::string from, std::string to) {
void Replace(std::string* src, std::string from, std::string to) {
std::size_t len_from = std::strlen(from.c_str());
std::size_t len_to = std::strlen(to.c_str());
for (std::size_t pos = src.find(from); pos != std::string::npos;
pos = src.find(from, pos + len_to)) {
src.replace(pos, len_from, to);
for (std::size_t pos = src->find(from); pos != std::string::npos;
pos = src->find(from, pos + len_to)) {
src->replace(pos, len_from, to);
}
}
void SetComment(std::string name, std::string equation) {
Replace(comment_, "{name}", name);
Replace(comment_, "{equation}", equation);
Replace(&comment_, "{name}", name);
Replace(&comment_, "{equation}", equation);
}
};
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/im2sequence_op.h"
#include <vector>
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,7 @@
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/label_smooth_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -100,7 +100,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
auto x_row_max = EigenMatrix<T>::From(emission_row_max);
x_row_max.device(place) =
x.maximum(Eigen::DSizes<int, 1>(1))
.reshape(Eigen::DSizes<int, 2>(int(batch_size), 1));
.reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
auto x_exps = EigenMatrix<T>::From(*emission_exps);
x_exps.device(place) =
......
......@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include <thread>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/operators/listen_and_serv_op.h"
......@@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
auto ins = Inputs("X");
auto fan_in = Attr<int>("Fanin");
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program();
auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
auto *program = optimize_block->Program();
size_t num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks");
......@@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
framework::Executor executor(dev_place);
std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
block_list.push_back(blkid);
if (blkid != prefetch_block->ID()) {
block_list.push_back(blkid);
}
}
auto prepared = executor.Prepare(*program, block_list);
auto optimize_prepared = executor.Prepare(*program, block_list);
// Insert placeholder for block0 which holds current op itself.
prepared.insert(prepared.begin(),
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
optimize_prepared.insert(
optimize_prepared.begin(),
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
// TODO(qiao) set proper fields for table lookup and update
rpc_service_->SetExecutor(&executor);
rpc_service_->SetPrefetchBlkdId(0);
VLOG(3) << "prefetch block id is " << prefetch_block->ID();
auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
rpc_service_->SetPrefetchBlkdId(prefetch_block->ID());
rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
prefetch_prepared.release();
rpc_service_->SetProgram(program);
// start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_));
......@@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
parallel_blkids.push_back(1);
double ts = detail::GetTimestamp();
for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
if (program->Block(blkid).Parent() != last_parent_blkid) {
ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
&recv_scope);
parallel_blkids.clear();
last_parent_blkid = program->Block(blkid).Parent();
if (blkid != prefetch_block->ID()) {
if (program->Block(blkid).Parent() != last_parent_blkid) {
ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
program, &recv_scope);
parallel_blkids.clear();
last_parent_blkid = program->Block(blkid).Parent();
}
parallel_blkids.push_back(blkid);
}
parallel_blkids.push_back(blkid);
}
ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
&recv_scope);
ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
program, &recv_scope);
VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
// Reset the received sparse variables, the sum operator would not
......@@ -211,6 +222,8 @@ from send_op and send back variables to recv_op.
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDesc *>(kOptimizeBlock,
"BlockID to run on server side.");
AddAttr<framework::BlockDesc *>(kPrefetchBlock,
"prefetch block to run on server side.");
AddAttr<int>("Fanin", "How many clients send to this server.")
.SetDefault(1);
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <stdint.h>
#include <ostream>
#include <string>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -27,6 +28,7 @@ namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
constexpr char kPrefetchBlock[] = "PrefetchBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/logical_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
......
......@@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) "
"Sparse update.")
.SetDefault(false);
AddAttr<bool>("is_distributed",
"(boolean, default false) distributed lookup table.")
.SetDefault(false);
AddAttr<int64_t>("padding_idx",
"(int64, default -1) "
"If the value is -1, it makes no effect to lookup. "
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lrn_op.h"
#include <string>
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lstm_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
......
......@@ -18,6 +18,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/operators/lstm_unit_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/hostdevice.h"
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lstmp_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
......
......@@ -288,9 +288,14 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......@@ -310,9 +315,13 @@ void batched_gemm<platform::CUDADeviceContext, float>(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......@@ -332,9 +341,13 @@ void batched_gemm<platform::CUDADeviceContext, double>(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/matmul_op.h"
#include <algorithm>
#include <vector>
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <functional>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matmul.h"
......
......@@ -13,6 +13,8 @@
* limitations under the License. */
#include "paddle/fluid/operators/maxout_op.h"
#include <vector>
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/minus_op.h"
#include "paddle/fluid/operators/net_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/momentum_op.h"
namespace paddle {
namespace operators {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mul_op.h"
#include <vector>
namespace paddle {
namespace operators {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/net_op.h"
#include <set>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
const char NetOp::kAll[] = "all";
void NetOp::CompleteAddOp(bool calc) {
add_op_done_ = true;
if (!calc) return;
std::set<std::string> input_set;
std::set<std::string> output_set;
for (auto& op : ops_) {
for (auto& ipt : op->Inputs()) {
for (auto& var_name : ipt.second) {
// If input variable has been in output set, then it will be
// added into intermediate_outputs_. Otherwise, it will be
// added into input set.
if (Contains(output_set, var_name)) {
intermediate_outputs_.insert(var_name);
} else {
input_set.insert(var_name);
}
}
}
for (auto& opt : op->Outputs()) {
for (auto& var_name : opt.second) {
output_set.insert(var_name);
}
}
}
auto& inputs = inputs_[kAll];
inputs.reserve(input_set.size());
std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs));
auto& outputs = outputs_[kAll];
outputs.reserve(output_set.size());
std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs));
}
std::string NetOp::DebugStringEx(const framework::Scope* scope) const {
std::ostringstream os;
os << OperatorBase::DebugStringEx(scope) << std::endl;
for (auto& op : ops_) {
std::istringstream is(op->DebugStringEx(scope));
for (std::string line; std::getline(is, line);) {
os << " " << line << std::endl;
}
}
return os.str();
}
bool NetOp::IsNetOp() const { return true; }
std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
std::vector<std::string> all;
for (auto& pair : this->outputs_) {
for (auto& var_name : pair.second) {
all.push_back(var_name);
}
}
if (has_intermediate) {
return all;
}
std::vector<std::string> ret_val;
for (auto& each : all) {
if (!Contains(intermediate_outputs_, each)) {
ret_val.push_back(each);
}
}
return ret_val;
}
NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
std::unique_ptr<framework::OperatorBase> NetOp::Clone() const {
PADDLE_ENFORCE(
add_op_done_,
"Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone");
return std::unique_ptr<OperatorBase>(new NetOp(*this));
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <set>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
/**
* @brief Network is also a type of Operator
*
* It will manage the operators it has.
*
* Network is the container and controller of a set of operators.
* A network object knows all Operators belonging to this network. Variables,
* which are inputs and outputs of these operators, are created and managed by a
* hierarchy of Scope objects.
*
* This is the base class of network, all the networks should implement the APIs
* it defines.
*/
class NetOp : public framework::OperatorBase {
public:
static const char kAll[];
NetOp()
: framework::OperatorBase("plain_net", framework::VariableNameMap{},
framework::VariableNameMap{},
framework::AttributeMap{}) {}
NetOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs);
NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
this->ops_.reserve(o.ops_.size());
std::transform(
o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
[](const std::unique_ptr<framework::OperatorBase>& op) {
return std::unique_ptr<framework::OperatorBase>(op->Clone());
});
this->CompleteAddOp();
}
bool SupportGPU() const override {
for (auto& op : ops_) {
if (!op->SupportGPU()) {
return false;
}
}
return true;
}
void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
/**
* @brief Add an operator by ptr
*/
void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
PADDLE_ENFORCE(!add_op_done_,
"Cannot AppendOp when this network is sealed");
PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
ops_.push_back(std::move(op));
}
void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
PADDLE_ENFORCE(!add_op_done_,
"Cannot InsertOp when this network is sealed");
PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
ops_.insert(ops_.begin() + pos, std::move(op));
}
void InsertOp(size_t pos, const framework::OperatorBase& op) {
InsertOp(pos, op.Clone());
}
void CompleteAddOp(bool calculate = true);
std::string DebugStringEx(
const framework::Scope* scope = nullptr) const override;
bool IsNetOp() const override;
std::vector<std::string> OutputVars(bool has_intermediate) const override;
std::unique_ptr<framework::OperatorBase> Clone() const override;
std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
private:
/**
* @brief Run the network.
*
* Run all the operators with the `scope`, if no scope is provided, default
* scope will be used instead. If no OpContext is provicded, default context
* will be used.
*/
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
for (auto& op : ops_) {
op->Run(scope, place);
}
}
bool add_op_done_{false};
std::set<std::string> intermediate_outputs_;
template <typename T, typename KeyType>
static bool Contains(T container, KeyType key) {
return container.find(key) != container.end();
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/net_op.h"
#include <gtest/gtest.h>
namespace paddle {
namespace operators {
using Scope = framework::Scope;
using DeviceContext = platform::DeviceContext;
static int run_cnt = 0;
class TestOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
DEFINE_OP_CLONE_METHOD(TestOp);
private:
void RunImpl(const Scope& scope,
const platform::Place& place) const override {
++run_cnt;
}
};
template <typename T>
void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
const std::vector<T>& actual) {
ASSERT_EQ(expected.size(), actual.size());
std::unordered_set<T> expected_set;
for (auto& tmp : expected) {
expected_set.insert(tmp);
}
for (auto& act : actual) {
ASSERT_NE(expected_set.end(), expected_set.find(act));
}
}
TEST(OpKernel, all) {
auto net = std::make_shared<NetOp>();
ASSERT_NE(net, nullptr);
net->AppendOp(std::unique_ptr<TestOp>(
new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
{{"Out", {"y"}}}, framework::AttributeMap{})));
net->AppendOp(std::unique_ptr<TestOp>(
new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
{{"Out", {"z"}}}, framework::AttributeMap{})));
net->CompleteAddOp();
AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
net->Inputs(NetOp::kAll));
AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll));
auto final_outs = net->OutputVars(false);
ASSERT_EQ(final_outs.size(), 1UL);
ASSERT_EQ(final_outs[0], "z");
}
TEST(NetOp, insert_op) {
NetOp net;
auto op1 = std::unique_ptr<framework::NOP>(
new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
{{"Out", {"y"}}}, framework::AttributeMap{}));
net.AppendOp(*op1);
net.InsertOp(0, *op1);
ASSERT_EQ(2UL, net.ops_.size());
net.InsertOp(2, std::move(op1));
ASSERT_EQ(3UL, net.ops_.size());
}
TEST(NetOp, Clone) {
NetOp net;
net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
"empty", framework::VariableNameMap{}, framework::VariableNameMap{},
framework::AttributeMap{}}));
net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
"empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
framework::AttributeMap{}}));
net.CompleteAddOp(true);
auto new_net_op = net.Clone();
ASSERT_NE(new_net_op, nullptr);
ASSERT_TRUE(new_net_op->IsNetOp());
auto* new_net = static_cast<NetOp*>(new_net_op.get());
ASSERT_EQ(2UL, new_net->ops_.size());
ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
}
} // namespace operators
} // namespace paddle
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
......
......@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
mkldnn::memory({src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(input_data)));
auto dst_memory =
mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
mkldnn::memory({dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(output_data)));
auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
*workspace_memory);
......@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pool_bwd_desc, mkldnn_engine, *pool_pd);
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
mkldnn::memory({diff_src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(in_x_grad_data)));
auto diff_dst_memory =
mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
mkldnn::memory({diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(out_grad_data)));
auto bwd_prim = mkldnn::pooling_backward(
pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future>
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
......@@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase {
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get "
<< outs[i] << "back";
VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
<< outs[i] << " back";
rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
outs[i]);
} else {
......@@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
"(RPCClient) The RPC client object which will be"
"initialized at most once.");
AddOutput("Out",
"(SelectedRows) result "
"(LoDTensor) result "
"to be fetched from parameter server")
.AsDuplicable();
AddAttr<std::vector<std::string>>(
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h"
#include "paddle/fluid/operators/net_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
bool flip = ctx->Attrs().Get<bool>("flip");
std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
if (max_sizes.size() > 0) {
......
......@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/transform.h"
......@@ -22,23 +24,23 @@ namespace operators {
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip,
std::vector<float>& output_aspect_ratior) {
std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6;
output_aspect_ratior.clear();
output_aspect_ratior.push_back(1.0f);
output_aspect_ratior->clear();
output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i];
bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true;
break;
}
}
if (!already_exist) {
output_aspect_ratior.push_back(ar);
output_aspect_ratior->push_back(ar);
if (flip) {
output_aspect_ratior.push_back(1.0f / ar);
output_aspect_ratior->push_back(1.0f / ar);
}
}
}
......@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/rank_loss_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
......@@ -19,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/operators/net_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -37,11 +37,11 @@ namespace m = paddle::operators::math;
std::unique_ptr<f::OperatorBase> listen_and_serv_op;
int selected_port;
void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) {
p::CPUDeviceContext ctx(place);
for (int i = 0; i < 2; ++i) {
auto var_name = paddle::string::Sprintf("x%d", i);
auto var = scope.Var(var_name);
auto var = scope->Var(var_name);
auto tensor = var->GetMutable<f::LoDTensor>();
tensor->Resize({10, 10});
float *expect = tensor->mutable_data<float>(place);
......@@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
}
}
auto out_var = scope.Var("Out");
auto out_var = scope->Var("Out");
auto out_tensor = out_var->GetMutable<f::LoDTensor>();
out_tensor->Resize({10, 10});
out_tensor->mutable_data<float>(place); // allocate
}
void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
p::CPUDeviceContext ctx(place);
int64_t height = 10;
int64_t row_numel = 10;
m::SetConstant<p::CPUDeviceContext, float> set_one;
// init x0
std::vector<int64_t> rows0{0, 4, 7};
auto x0_var = scope.Var("x0");
auto x0_var = scope->Var("x0");
auto x0 = x0_var->GetMutable<f::SelectedRows>();
x0->set_rows(rows0);
x0->set_height(height);
......@@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
// init x1
std::vector<int64_t> rows1{2, 9};
auto x1_var = scope.Var("x1");
auto x1_var = scope->Var("x1");
auto x1 = x1_var->GetMutable<f::SelectedRows>();
x1->set_rows(rows1);
x1->set_height(height);
......@@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
set_one(ctx, x1_value, 1.0);
auto out_var = scope.Var("Out");
auto out_var = scope->Var("Out");
auto out = out_var->GetMutable<f::SelectedRows>();
auto out_value = out->mutable_value();
out->set_height(height);
......@@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) {
f::Scope scope;
p::CPUPlace place;
if (is_sparse) {
InitSelectedRowsInScope(scope, place);
InitSelectedRowsInScope(place, &scope);
} else {
InitTensorsInScope(scope, place);
InitTensorsInScope(place, &scope);
}
// sub program run in listen_and_serv_op, for simple test we use sum
f::ProgramDesc program;
const auto &root_block = program.Block(0);
auto *optimize_block = program.AppendBlock(root_block);
auto *prefetch_block = program.AppendBlock(root_block);
// X for server side tensors, RX for received tensers, must be of same shape.
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
......@@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) {
attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
attrs.insert({"GradList", std::vector<std::string>({"x1"})});
attrs.insert({"OptimizeBlock", optimize_block});
attrs.insert({"PrefetchBlock", prefetch_block});
listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
LOG(INFO) << "selected port before run " << selected_port;
......@@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) {
// local net
f::Scope scope;
p::CPUPlace place;
InitTensorsInScope(scope, place);
InitTensorsInScope(place, &scope);
// create rpc client var
scope.Var("RPC_CLIENT_VAR");
......@@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) {
f::Scope scope;
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
InitSelectedRowsInScope(scope, place);
InitSelectedRowsInScope(place, &scope);
scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs;
selected_port = static_cast<paddle::operators::ListenAndServOp *>(
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future>
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
......@@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase {
auto ins = Inputs("X");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_sent");
int sync_send = Attr<int>("sync_send");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
......
......@@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 element");
auto param_dim = ctx->GetInputDim("Param");
// TODO(qijun): check dimensions of Param and Grad at complie
// and run time.
// TODO(qijun): check dimensions of Param and Grad at compile
// and runtime.
ctx->SetOutputDim("ParamOut", param_dim);
}
......
......@@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
auto ids_var_type = ctx->GetInputsVarType("Ids").front();
PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
}
}
};
......@@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
auto *input_var = block->Var(op_desc.Input("Ids")[0]);
for (auto &out_var : op_desc.Output("Out")) {
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
block->Var(out_var)->SetType(input_var->GetType());
}
}
};
......@@ -73,4 +74,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
ops::SplitIdsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
......@@ -24,35 +24,63 @@ namespace operators {
template <typename DeviceContext, typename T>
class SplitIdsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
if (!platform::is_cpu_place(place)) {
PADDLE_THROW("SplitIds do not support GPU kernel");
}
auto& ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
const T* ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
const size_t shard_num = outs.size();
const auto *ids_var = ctx.InputVar("Ids");
if (ids_var->IsType<framework::LoDTensor>()) {
const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
const size_t shard_num = outs.size();
std::vector<std::vector<T>> out_ids;
out_ids.resize(outs.size());
std::vector<std::vector<T>> out_ids;
out_ids.resize(outs.size());
// split id by their shard_num.
for (int i = 0; i < ids_dims[0]; ++i) {
T id = ids[i];
size_t shard_id = static_cast<size_t>(id) % shard_num;
out_ids[shard_id].push_back(id);
}
// split id by their shard_num.
for (int i = 0; i < ids_dims[0]; ++i) {
T id = ids[i];
size_t shard_id = static_cast<size_t>(id) % shard_num;
out_ids[shard_id].push_back(id);
}
// create tensor for each shard and send to parameter server
for (size_t i = 0; i < out_ids.size(); ++i) {
auto *shard_t = outs[i];
std::vector<T> ids = out_ids[i];
auto *shard_data = shard_t->mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
for (size_t i = 0; i < ids.size(); ++i) {
shard_data[i] = ids[i];
}
}
} else if (ids_var->IsType<framework::SelectedRows>()) {
const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
auto &ids_dims = ids_selected_rows->value().dims();
PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
const T *ids = ids_selected_rows->value().data<T>();
const auto &ids_rows = ids_selected_rows->rows();
auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
const size_t shard_num = outs.size();
// get rows for outputs
for (auto &id : ids_rows) {
size_t shard_id = static_cast<size_t>(id) % shard_num;
outs[shard_id]->mutable_rows()->push_back(id);
}
// create tensor for each shard and send to parameter server
for (size_t i = 0; i < out_ids.size(); ++i) {
auto* shard_t = outs[i];
std::vector<T> ids = out_ids[i];
auto* shard_data = shard_t->mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
for (size_t i = 0; i < ids.size(); ++i) {
shard_data[i] = ids[i];
int64_t row_width = ids_dims[1];
for (auto &out : outs) {
out->set_height(ids_selected_rows->height());
framework::DDim ddim = framework::make_ddim(
{static_cast<int64_t>(out->rows().size()), row_width});
T *output = out->mutable_value()->mutable_data<T>(ddim, place);
for (size_t i = 0; i < ddim[0]; ++i) {
memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
row_width * sizeof(T));
}
}
}
}
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_op.h"
#include "paddle/fluid/operators/net_op.h"
namespace paddle {
namespace operators {
......
......@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride,
const framework::DDim& dst_dim,
const framework::DDim& dst_stride, T* dst) {
using namespace detail;
StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
paddle::operators::detail::StridedCopyDimVisitor<T> func(
dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim);
}
......
......@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sum_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
......@@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputsDim("X");
size_t N = x_dims.size();
PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
if (N == 1) {
VLOG(3) << "Warning: sum have only one input, may waste memory";
}
framework::DDim in_dim({0});
for (auto& x_dim : x_dims) {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
namespace paddle {
......@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* src,
bool& firstStep, bool& is_empty,
Pair<T>& max, int dim,
bool* firstStep, bool* is_empty,
Pair<T>* max, int dim,
const int tid) {
if (beam > 0) {
int length = beam < beam_size ? beam : beam_size;
if (firstStep) {
firstStep = false;
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, src, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - beam) {
topk[k] = topk[k + beam];
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-INFINITY, -1);
}
}
if (!is_empty) {
GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
length);
}
}
max = topk[MaxLength - 1];
if (max.v == -1) is_empty = true;
beam = 0;
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* val,
int* col, bool& firstStep,
bool& is_empty, Pair<T>& max,
int* col, bool* firstStep,
bool* is_empty, Pair<T>* max,
int dim, const int tid) {
if (beam > 0) {
int length = beam < beam_size ? beam : beam_size;
if (firstStep) {
firstStep = false;
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - beam) {
topk[k] = topk[k + beam];
if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-INFINITY, -1);
}
}
if (!is_empty) {
GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
length);
}
}
max = topk[MaxLength - 1];
if (max.v == -1) is_empty = true;
beam = 0;
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
Pair<T> topk[], T** topVal,
int64_t** topIds, int& beam, int& k,
int64_t** topIds, int* beam, int* k,
const int tid, const int warp) {
while (true) {
__syncthreads();
......@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
(*topVal)++;
(*topIds)++;
}
if (tid == maxid[0]) beam++;
if (--k == 0) break;
if (tid == maxid[0]) (*beam)++;
if (--(*k) == 0) break;
__syncthreads();
if (tid == maxid[0]) {
if (beam < MaxLength) {
sh_topk[tid] = topk[beam];
if (*beam < MaxLength) {
sh_topk[tid] = topk[*beam];
}
}
if (maxid[0] / 32 == warp) {
if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break;
}
}
}
......@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
topk[k].set(-INFINITY, -1);
}
while (k) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
src + blockIdx.x * lds, firststep,
is_empty, max, dim, tid);
ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
src + blockIdx.x * lds, &firststep,
&is_empty, &max, dim, tid);
sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
&indices, beam, k, tid, warp);
&indices, &beam, &k, tid, warp);
}
}
......@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
KeMatrixTopK<T, 5, 256><<<
grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
.stream()>>>(output_data, output->dims()[1],
indices_data, input_data,
input_width, input_width, int(k));
.stream()>>>(
output_data, output->dims()[1], indices_data, input_data, input_width,
input_width, static_cast<int>(k));
}
};
......
......@@ -2,13 +2,13 @@ if(WITH_PYTHON)
if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
parallel_executor
${GLOB_OP_LIB})
else()
cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
parallel_executor
${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
......
......@@ -18,7 +18,6 @@ limitations under the License. */
#include <string>
#include <tuple>
#include "paddle/fluid/framework/backward.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -125,23 +124,6 @@ void BindProgramDesc(pybind11::module *m) {
})
.def("append_block", &pd::ProgramDesc::AppendBlock,
pybind11::return_value_policy::reference)
.def("append_backward",
[](pd::ProgramDesc &program_desc, const pd::VarDesc &target,
const std::unordered_set<std::string> &no_grad_vars) {
pd::ParamGradInfoMap param_grad_map =
AppendBackward(program_desc, target, no_grad_vars);
std::unordered_map<
std::string, std::tuple<std::string /* grad_var_name */,
int /* block_idx */, int /* op_idx */>>
retv;
for (auto it = param_grad_map.begin(); it != param_grad_map.end();
++it) {
const auto &grad_info = it->second;
retv[it->first] = std::make_tuple(
grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
}
return retv;
})
.def("block", &pd::ProgramDesc::MutableBlock,
pybind11::return_value_policy::reference)
.def("num_blocks", &pd::ProgramDesc::Size)
......
......@@ -20,9 +20,6 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/framework/backward.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
......@@ -31,18 +28,18 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/cond_op.h"
#include "paddle/fluid/operators/net_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/recordio.h"
#include "paddle/fluid/pybind/tensor_py.h"
......@@ -239,11 +236,6 @@ All parameter, weight, gradient are variables in Paddle.
},
py::return_value_policy::reference)
#endif
.def("get_net",
[](Variable &self) -> operators::NetOp * {
return self.GetMutable<operators::NetOp>();
},
py::return_value_policy::reference)
.def("get_reader",
[](Variable &self) -> framework::ReaderHolder * {
PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
......@@ -388,11 +380,6 @@ All parameter, weight, gradient are variables in Paddle.
desc.InitializationErrorString());
return OpRegistry::CreateOp(desc);
})
.def("backward",
[](const OperatorBase &forwardOp,
const std::unordered_set<std::string> &no_grad_vars) {
return Backward(forwardOp, no_grad_vars).release();
})
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CPUPlace &place) { self.Run(scope, place); })
......@@ -420,42 +407,6 @@ All parameter, weight, gradient are variables in Paddle.
[](const OperatorBase &op) { return op.OutputVars(false); })
.def("support_gpu", &OperatorBase::SupportGPU);
py::class_<operators::NetOp, OperatorBase>(m, "Net")
.def_static("create",
[]() -> operators::NetOp * {
auto *retv = new operators::NetOp;
retv->SetType("plain_net");
return retv;
})
.def("append_op", [](operators::NetOp &self,
const OperatorBase &op) { self.AppendOp(op); })
.def("complete_add_op", &operators::NetOp::CompleteAddOp)
.def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
self->CompleteAddOp();
});
// cond_op
py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
.def_static("create",
[](py::bytes protobin) -> operators::CondOp * {
proto::OpDesc desc;
PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
"Cannot parse user input to OpDesc");
PADDLE_ENFORCE(desc.IsInitialized(),
"User OpDesc is not initialized, reason %s",
desc.InitializationErrorString());
auto cond_op = OpRegistry::CreateOp(desc);
return static_cast<operators::CondOp *>(cond_op.release());
})
.def("set_truenet",
[](operators::CondOp &self, const operators::NetOp &net) -> void {
self.set_truenet(net.Clone());
})
.def("set_falsenet",
[](operators::CondOp &self, const operators::NetOp &net) -> void {
self.set_falsenet(net.Clone());
});
py::class_<framework::Executor>(m, "Executor")
.def(py::init<const platform::Place &>())
.def("run",
......@@ -553,6 +504,7 @@ All parameter, weight, gradient are variables in Paddle.
bcast_vars, main_program, loss_var_name,
scope, local_scopes, allow_op_delay);
})
.def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
.def("local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes();
......
......@@ -14,13 +14,13 @@
#include "paddle/fluid/recordio/chunk.h"
#include <zlib.h>
#include <algorithm>
#include <memory>
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
#include "snappy_stream/include/snappystream.hpp"
#include "zlib/include/zlib.h"
#include "snappystream.hpp"
namespace paddle {
namespace recordio {
......
......@@ -13,6 +13,9 @@
// limitations under the License.
#include "paddle/fluid/recordio/header.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......
......@@ -231,7 +231,7 @@ function gen_fluid_inference_lib() {
Deploying fluid inference library ...
========================================
EOF
make inference_lib_dist
make -j `nproc` inference_lib_dist
fi
}
......
......@@ -29,6 +29,7 @@ import optimizer
import backward
import regularizer
import average
import metrics
from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import numpy as np
import warnings
"""
Class of all kinds of Average.
......@@ -22,6 +23,8 @@ import numpy as np
wrappers of Python functions.
"""
__all__ = ["WeightedAverage"]
def _is_number_(var):
return isinstance(var, int) or isinstance(var, float) or (isinstance(
......@@ -34,6 +37,9 @@ def _is_number_or_matrix_(var):
class WeightedAverage(object):
def __init__(self):
warnings.warn(
"The %s is deprecated, please use fluid.metrics.Accuracy instead." %
(self.__class__.__name__), Warning)
self.reset()
def reset(self):
......
......@@ -13,14 +13,17 @@
# limitations under the License.
from __future__ import print_function
import framework
from framework import Program, default_main_program, default_startup_program, Parameter, Variable
import optimizer
from layer_helper import LayerHelper
import distributed_splitter as splitter
import math
import distributed_splitter as splitter
import framework
from framework import Program, default_main_program, Variable
from . import core
import debuger
LOOKUP_TABLE_TYPE = "lookup_table"
LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR"
class VarBlock:
......@@ -35,9 +38,9 @@ class VarBlock:
class UnionFind(object):
""" Union-find data struct.
""" Union-find data structure.
Union-find is a data struct that keeps track of a set of elements partitioned
Union-find is a data structure that keeps track of a set of elements partitioned
into a number of disjoint (non-overlapping) subsets.
Reference:
......@@ -185,19 +188,66 @@ class DistributeTranspiler:
assert (callable(split_method))
if program is None:
program = default_main_program()
self.program = program
self.trainers = trainers
self.origin_program = program
self.trainer_num = trainers
self.optimize_ops = optimize_ops
# TODO(typhoonzero): currently trainer_id is fetched from cluster system
# like Kubernetes, we should port this to use etcd later when developing
# fluid distributed training with fault-tolerance.
self.trainer_id = trainer_id
pserver_endpoints = pservers.split(",")
self.pserver_endpoints = pserver_endpoints
# process lookup_table_op
# 1. check all lookup_table_op is distributed
# 2. check all lookup_table_op share the same table.
distributed_lookup_table_ops = []
# support only one distributed_lookup_table now
self.table_name = None
for op in program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if op.attrs['is_distributed'] is True:
if self.table_name is None:
self.table_name = op.input("W")[0]
if self.table_name != op.input("W")[0]:
raise RuntimeError("all distributed lookup_table_ops"
" should have only one table")
distributed_lookup_table_ops.append(op)
else:
if self.table_name is not None:
assert op.input("W")[0] != self.table_name
self.has_distributed_lookup_table = len(
distributed_lookup_table_ops) > 0
# step1: For large parameters and gradients, split them into smaller
# blocks.
param_list = [pg[0] for pg in params_grads]
grad_list = [pg[1] for pg in params_grads]
if self.has_distributed_lookup_table:
param_list = [
param for param in param_list if param.name != self.table_name
]
grad_list = [
grad for grad in grad_list
if grad.name != framework.grad_var_name(self.table_name)
]
self.table_param_grad = [
param_grad for param_grad in params_grads
if param_grad[0].name == self.table_name
][0]
table_grad_var = self.table_param_grad[1]
self.table_grad_list = [
program.global_block().create_var(
name="%s.trainer_%d.pserver_%d" %
(table_grad_var.name, trainer_id, index),
type=table_grad_var.type,
shape=table_grad_var.shape,
dtype=table_grad_var.dtype)
for index in range(len(self.pserver_endpoints))
]
grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
# step2: Create new vars for the parameters and gradients blocks and
......@@ -229,7 +279,7 @@ class DistributeTranspiler:
self.param_grad_ep_mapping[ep]["grads"].append(grad)
rpc_client_var = program.global_block().create_var(
name="RPC_CLIENT_VAR",
name=RPC_CLIENT_VAR_NAME,
persistable=True,
type=core.VarDesc.VarType.RAW)
......@@ -252,12 +302,19 @@ class DistributeTranspiler:
outputs={"Out": [orig_param]},
attrs={"axis": 0})
if self.has_distributed_lookup_table:
self._replace_lookup_table_op_with_prefetch(program, rpc_client_var,
eplist)
self._split_table_grad_and_add_send_vars(program, rpc_client_var,
pserver_endpoints)
def get_trainer_program(self):
# remove optimize ops and add a send op to main_program
self.program.global_block().delete_ops(self.optimize_ops)
self.origin_program.global_block().delete_ops(self.optimize_ops)
self.origin_program.sync_with_cpp()
# FIXME(typhoonzero): serialize once will fix error occurs when clone.
self.program.__str__()
return self.program
self.origin_program.__str__()
return self.origin_program
def get_pserver_program(self, endpoint):
"""
......@@ -293,8 +350,8 @@ class DistributeTranspiler:
type=v.type,
dtype=v.dtype,
shape=v.shape)
if self.trainers > 1:
for trainer_id in xrange(self.trainers):
if self.trainer_num > 1:
for trainer_id in xrange(self.trainer_num):
var = pserver_program.global_block().create_var(
name="%s.trainer_%d" % (orig_var_name, trainer_id),
persistable=False,
......@@ -308,7 +365,7 @@ class DistributeTranspiler:
# step3
optimize_block = pserver_program.create_block(0)
# step 4
# Create a union-find data struct from optimize ops,
# Create a union-find data structure from optimize ops,
# If two ops are connected, we could add these two ops
# into one set.
ufind = self._create_ufind(self.optimize_ops)
......@@ -383,6 +440,23 @@ class DistributeTranspiler:
# __append_optimize_op__(glb_op, optimize_block)
# break
# process distributed lookup_table
prefetch_block = None
if self.has_distributed_lookup_table:
pserver_index = self.pserver_endpoints.index(endpoint)
self._create_table_optimize_block(pserver_index, pserver_program,
append_block)
prefetch_block = self._create_prefetch_block(
pserver_index, pserver_program, optimize_block)
# NOTE: if has_distributed_lookup_table is False, then prefetch_block will
# not be executed, so it's safe to use optimize_block to hold the place
if self.has_distributed_lookup_table:
assert prefetch_block is not None
else:
assert prefetch_block is None
prefetch_block = pserver_program.global_block()
# step5 append the listen_and_serv op
pserver_program.global_block().append_op(
type="listen_and_serv",
......@@ -391,8 +465,10 @@ class DistributeTranspiler:
attrs={
"OptimizeBlock": optimize_block,
"endpoint": endpoint,
"Fanin": self.trainers
"Fanin": self.trainer_num,
"PrefetchBlock": prefetch_block
})
pserver_program.sync_with_cpp()
return pserver_program
......@@ -450,6 +526,197 @@ class DistributeTranspiler:
attrs=op.attrs)
return s_prog
# transpiler function for dis lookup_table
def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var,
eplist):
# 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
self.prefetch_input_vars = None
self.prefetch_output_vars = None
continue_search_lookup_table_op = True
while continue_search_lookup_table_op:
continue_search_lookup_table_op = False
all_ops = program.global_block().ops
for op in all_ops:
if op.type == LOOKUP_TABLE_TYPE:
continue_search_lookup_table_op = True
op_index = list(all_ops).index(op)
ids_name = op.input("Ids")
out_name = op.output("Out")
if self.prefetch_input_vars is None:
ids_var = program.global_block().vars[ids_name[0]]
self.prefetch_input_vars = self.create_splited_vars(
source_var=ids_var,
block=program.global_block(),
tag="_prefetch_in_")
if self.prefetch_output_vars is None:
out_var = program.global_block().vars[out_name[0]]
self.prefetch_output_vars = self.create_splited_vars(
source_var=out_var,
block=program.global_block(),
tag="_prefetch_out_")
# insert split_ids_op
program.global_block().insert_op(
index=op_index,
type="split_ids",
inputs={
'Ids': [
program.global_block().vars[varname]
for varname in ids_name
]
},
outputs={"Out": self.prefetch_input_vars})
# insert prefetch_op
program.global_block().insert_op(
index=op_index + 1,
type="prefetch",
inputs={'X': self.prefetch_input_vars},
outputs={
"Out": self.prefetch_output_vars,
"RPCClient": rpc_client_var
},
attrs={"epmap": eplist})
# insert concat_op
program.global_block().insert_op(
index=op_index + 2,
type="concat",
inputs={'X': self.prefetch_output_vars},
outputs={
"Out": [
program.global_block().vars[varname]
for varname in out_name
]
},
attrs={"axis": 0})
# delete lookup_table_op
program.global_block().delete_ops([op])
program.sync_with_cpp()
# break for loop
break
def _split_table_grad_and_add_send_vars(self, program, rpc_client_var,
pserver_endpoints):
# 2. add split_ids_op and send_vars_op to send gradient to pservers
# there should only be one table_name
all_ops = program.global_block().ops
table_grad_name = framework.grad_var_name(self.table_name)
for op in all_ops:
if table_grad_name in op.output_arg_names:
op_index = list(all_ops).index(op)
# insert split_ids_op
program.global_block().insert_op(
index=op_index + 1,
type="split_ids",
inputs={
'Ids': [program.global_block().vars[table_grad_name]]
},
outputs={"Out": self.table_grad_list})
program.global_block().insert_op(
index=op_index + 2,
type="send_vars",
inputs={'X': self.table_grad_list},
outputs={"RPCClient": rpc_client_var},
attrs={"sync_send": True,
"epmap": pserver_endpoints})
break
def _create_prefetch_block(self, pserver_index, pserver_program,
optimize_block):
# STEP: create prefetch block
table_var = pserver_program.global_block().vars[self.table_name]
prefetch_block = pserver_program.create_block(optimize_block.idx)
trainer_ids = self.prefetch_input_vars[pserver_index]
pserver_ids = pserver_program.global_block().create_var(
name=trainer_ids.name,
type=trainer_ids.type,
shape=trainer_ids.shape,
dtype=trainer_ids.dtype)
trainer_out = self.prefetch_output_vars[pserver_index]
pserver_out = pserver_program.global_block().create_var(
name=trainer_out.name,
type=trainer_out.type,
shape=trainer_out.shape,
dtype=trainer_out.dtype)
prefetch_block.append_op(
type=LOOKUP_TABLE_TYPE,
inputs={'Ids': pserver_ids,
"W": table_var},
outputs={"Out": pserver_out},
attrs={
"is_sparse": True, # has no effect on lookup_table op
"is_distributed": True,
"padding_idx": -1
})
return prefetch_block
def _create_table_optimize_block(self, pserver_index, pserver_program,
append_block):
def _clone_var(block, var, persistable=True):
assert isinstance(var, Variable)
return block.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
persistable=persistable)
# STEP: create table optimize block
# create table param and grad var in pserver program
param_var = _clone_var(
pserver_program.global_block(),
self.origin_program.global_block().vars[self.table_name])
grad_var = _clone_var(
pserver_program.global_block(),
self.origin_program.global_block().vars[framework.grad_var_name(
self.table_name)],
persistable=False)
# create grad vars in pserver program
table_grad_var = self.table_param_grad[1]
table_grad_list = [
pserver_program.global_block().create_var(
name="%s.trainer_%d.pserver_%d" %
(table_grad_var.name, index, pserver_index),
type=table_grad_var.type,
shape=table_grad_var.shape,
dtype=table_grad_var.dtype) for index in range(self.trainer_num)
]
# create table optimize block in pserver program
table_opt_op = [
op for op in self.optimize_ops
if op.input("Param")[0] == self.table_name
][0]
table_opt_block = pserver_program.create_block(append_block.idx)
# only support sgd now
assert table_opt_op.type == "sgd"
# append sum op for table_grad_list
table_opt_block.append_op(
type="sum",
inputs={"X": table_grad_list},
outputs={"Out": [grad_var]})
lr_var = pserver_program.global_block().vars[table_opt_op.input(
"LearningRate")[0]]
inputs = {
"Param": [param_var],
"Grad": [grad_var],
"LearningRate": [lr_var]
}
outputs = {"ParamOut": [param_var]}
table_opt_block.append_op(
type=table_opt_op.type,
inputs=inputs,
outputs=outputs,
attrs=table_opt_op.attrs)
# ====================== private transpiler functions =====================
def _create_vars_from_blocklist(self,
program,
......@@ -511,7 +778,17 @@ class DistributeTranspiler:
program.global_block().sync_with_cpp()
return var_mapping
def _clone_var(self, block, var):
def create_splited_vars(self, source_var, block, tag):
return [
block.create_var(
name=str(source_var.name + tag + str(index)),
type=source_var.type,
shape=source_var.shape,
dtype=source_var.dtype)
for index in range(len(self.pserver_endpoints))
]
def _clone_var(self, block, var, persistable=True):
assert isinstance(var, Variable)
return block.create_var(
name=var.name,
......@@ -519,12 +796,12 @@ class DistributeTranspiler:
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=True)
persistable=persistable)
def _append_split_op(self, program, gradblocks):
# Split variables that need to be split and append respective ops
add_suffix = False
if self.trainers > 1:
if self.trainer_num > 1:
add_suffix = True
var_mapping = self._create_vars_from_blocklist(
program, gradblocks, add_trainer_suffix=add_suffix)
......@@ -615,9 +892,9 @@ class DistributeTranspiler:
return
merged_var = \
pserver_block.vars[self._orig_varname(grad_block.name)]
if self.trainers > 1:
if self.trainer_num > 1:
vars2merge = []
for i in xrange(self.trainers):
for i in xrange(self.trainer_num):
per_trainer_name = "%s.trainer_%d" % \
(self._orig_varname(grad_block.name), i)
vars2merge.append(pserver_block.vars[per_trainer_name])
......@@ -632,7 +909,7 @@ class DistributeTranspiler:
type="scale",
inputs={"X": merged_var},
outputs={"Out": merged_var},
attrs={"scale": 1.0 / float(self.trainers)})
attrs={"scale": 1.0 / float(self.trainer_num)})
new_inputs[key] = merged_var
elif key == "Param":
# param is already created on global program
......@@ -668,7 +945,7 @@ class DistributeTranspiler:
new_shape = None
if key in ["Param", "Grad", "LearningRate"]:
continue
var = self.program.global_block().vars[opt_op.input(key)[0]]
var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
# update accumulator variable shape
param_shape = new_inputs["Param"].shape
new_shape = self._get_optimizer_input_shape(opt_op.type, key,
......@@ -681,8 +958,8 @@ class DistributeTranspiler:
new_inputs[key] = tmpvar
# change output's ParamOut variable
outputs = self._get_output_map_from_op(self.program.global_block().vars,
opt_op)
outputs = self._get_output_map_from_op(
self.origin_program.global_block().vars, opt_op)
outputs["ParamOut"] = new_inputs["Param"]
optimize_block.append_op(
......@@ -694,8 +971,8 @@ class DistributeTranspiler:
def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
program = optimize_block.program
# Append the ops for parameters that do not need to be optimized/updated
inputs = self._get_input_map_from_op(self.program.global_block().vars,
opt_op)
inputs = self._get_input_map_from_op(
self.origin_program.global_block().vars, opt_op)
for varlist in inputs.itervalues():
if not isinstance(varlist, list):
varlist = [varlist]
......@@ -708,8 +985,8 @@ class DistributeTranspiler:
dtype=var.dtype,
shape=var.shape)
outputs = self._get_output_map_from_op(self.program.global_block().vars,
opt_op)
outputs = self._get_output_map_from_op(
self.origin_program.global_block().vars, opt_op)
for varlist in outputs.itervalues():
if not isinstance(varlist, list):
......@@ -782,7 +1059,6 @@ class DistributeTranspiler:
if same_or_split_var(n, param) and n != param:
return True
return False
return False
def _get_input_map_from_op(self, varmap, op):
"""Returns a dict from op input name to the vars in varmap."""
......@@ -820,7 +1096,7 @@ class DistributeTranspiler:
find_ops = []
# find ops which output is lr var
block = self.program.global_block()
block = self.origin_program.global_block()
for op in block.ops:
if set(op.output_arg_names) & lr_vars:
find_ops.append(op)
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import numpy as np
import layers
......@@ -59,6 +60,9 @@ class Evaluator(object):
"""
def __init__(self, name, **kwargs):
warnings.warn(
"The %s is deprecated, because maintain a modified program inside evaluator cause bug easily, please use fluid.metrics.%s instead."
% (self.__class__.__name__, self.__class__.__name__), Warning)
self.states = []
self.metrics = []
self.helper = LayerHelper(name, **kwargs)
......
......@@ -1119,24 +1119,6 @@ class Program(object):
def current_block(self):
return self.blocks[self.current_block_idx]
def append_backward(self, target, no_grad_set=None):
"""
return map(param_name -> (grad_name, block_index, op_index))
"""
assert isinstance(target, Variable)
if no_grad_set is None:
no_grad_set = set()
try:
param_to_grad_info = self.desc.append_backward(target.desc,
no_grad_set)
except Exception as e:
raise core.EnforceNotMet(
str(e) + "\nCurrent protobuf is\n{0}".format(
self.to_string(False)))
self.sync_with_cpp()
return param_to_grad_info
def create_block(self, parent_idx=None):
new_block_idx = len(self.blocks)
parent = self.current_block() if parent_idx is None else self.block(
......@@ -1201,6 +1183,8 @@ class Parameter(Variable):
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
def __str__(self):
return self.to_string(True)
......@@ -1221,7 +1205,7 @@ class Parameter(Variable):
if with_details:
res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer",
"gradient_clip_attr")
"gradient_clip_attr", "do_model_average")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
str(getattr(self, attr_name)))
......
......@@ -18,7 +18,8 @@ import contextlib
__all__ = [
'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
'init_on_cpu'
'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
'NormalInitializer', 'XavierInitializer'
]
_force_init_on_cpu_ = False
......
......@@ -15,12 +15,13 @@
All layers just related to metric.
"""
import warnings
from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr
__all__ = ['accuracy']
__all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None):
......@@ -55,3 +56,37 @@ def accuracy(input, label, k=1, correct=None, total=None):
"Total": [total],
})
return acc_out
def auc(input, label, curve='ROC', num_thresholds=200):
warnings.warn(
"This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
but can not aggregate them and get the pass AUC, because pass \
auc can not be averaged with weighted from the minibatch auc value. \
Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
which can get every minibatch and every pass auc value.", Warning)
helper = LayerHelper("auc", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
auc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
attrs={"curve": curve,
"num_thresholds": num_thresholds},
outputs={"AUC": [auc_out], })
return auc_out
......@@ -218,6 +218,7 @@ def fc(input,
def embedding(input,
size,
is_sparse=False,
is_distributed=False,
padding_idx=None,
param_attr=None,
dtype='float32'):
......@@ -268,8 +269,11 @@ def embedding(input,
inputs={'Ids': input,
'W': w},
outputs={'Out': tmp},
attrs={'is_sparse': is_sparse,
'padding_idx': padding_idx})
attrs={
'is_sparse': is_sparse,
'is_distributed': is_distributed,
'padding_idx': padding_idx
})
return tmp
......@@ -1516,7 +1520,8 @@ def batch_norm(input,
in_place=False,
name=None,
moving_mean_name=None,
moving_variance_name=None):
moving_variance_name=None,
do_model_average_for_mean_and_var=False):
"""
This function helps create an operator to implement
the BatchNorm layer using the configurations from the input parameters.
......@@ -1547,7 +1552,10 @@ def batch_norm(input,
mean = helper.create_parameter(
attr=ParamAttr(
name=moving_mean_name, initializer=Constant(0.0), trainable=False),
name=moving_mean_name,
initializer=Constant(0.0),
trainable=False,
do_model_average=do_model_average_for_mean_and_var),
shape=param_shape,
dtype=input.dtype)
mean.stop_gradient = True
......@@ -1556,7 +1564,8 @@ def batch_norm(input,
attr=ParamAttr(
name=moving_variance_name,
initializer=Constant(1.0),
trainable=False),
trainable=False,
do_model_average=do_model_average_for_mean_and_var),
shape=param_shape,
dtype=input.dtype)
variance.stop_gradient = True
......@@ -3374,14 +3383,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
Here are some examples to explain it.
1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
is [6, 8], the reshape operator will transform x into a 2-D tensor with
is [6, 8], the reshape operator will transform x into a 2-D tensor with
shape [6, 8] and leaving x's data unchanged.
2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
specified is [2, 3, -1, 2], the reshape operator will transform x into a
4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
case, one dimension of the target shape is set to -1, the value of this
dimension is inferred from the total element number of x and remaining
case, one dimension of the target shape is set to -1, the value of this
dimension is inferred from the total element number of x and remaining
dimensions.
3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
......@@ -3615,7 +3624,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
def pad(x, paddings, pad_value=0., name=None):
"""
Pads a tensor with a constant value given by :attr:`pad_value`, and the
padded width is specified by :attr:`paddings`.
padded width is specified by :attr:`paddings`.
Specifically, the number of values padded before the contents of :attr:`x`
in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
......@@ -3643,7 +3652,7 @@ def pad(x, paddings, pad_value=0., name=None):
x (Variable): The input tensor variable.
paddings (list): A list of integers. Its elements specify the padded
width before and after for each dimension in turn.
The length of :attr:paddings must be
The length of :attr:paddings must be
:math:`rank(x) \\times 2`.
pad_value (float): The constant value used to pad.
name(str|None): A name for this layer(optional). If set None, the layer
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fluid Metrics
The metrics are accomplished via Python natively.
"""
import numpy as np
import copy
import warnings
__all__ = [
'MetricBase',
'CompositeMetric',
'Accuracy',
'ChunkEvaluator',
'EditDistance',
'DetectionMAP',
'Auc',
]
def _is_numpy_(var):
return isinstance(var, (np.ndarray, np.generic))
def _is_number_(var):
return isinstance(var, int) or isinstance(var, float) or (isinstance(
var, np.ndarray) and var.shape == (1, ))
def _is_number_or_matrix_(var):
return _is_number_(var) or isinstance(var, np.ndarray)
class MetricBase(object):
"""
Base Class for all evaluators
Args:
name(str): The name of evaluator. such as, "accuracy". Used for generate
temporary variable name.
Interface:
Note(*) : the states is the attributes who not has _ prefix.
get_config(): print current states and configuration
reset(): clear the states. If the Metrics states type is not (int, float, np.ndarray),
Please override this method.
update(): update states at every minibatch
eval(): get metric evaluation in numpy type.
"""
def __init__(self, name, **kwargs):
self._name = str(name) if name != None else self.__class__.__name__
self._kwargs = kwargs if kwargs != None else dict()
self.reset()
def __str__(self):
return self._name
def reset(self):
"""
states is the attributes who not has _ prefix.
reset the states of metrics.
"""
states = {
attr: value
for attr, value in self.__dict__.iteritems()
if not attr.startswith("_")
}
for attr, value in states.iteritems():
if isinstance(value, int):
setattr(self, attr, 0)
elif isinstance(value, float):
setattr(self, attr, .0)
elif isinstance(value, (np.ndarray, np.generic)):
setattr(self, attr, np.zeros_like(value))
else:
setattr(self, attr, None)
def get_config(self):
states = {
attr: value
for attr, value in self.__dict__.iteritems()
if not attr.startswith("_")
}
config = copy.deepcopy(self._kwargs)
config.update({"name": self._name, "states": copy.deepcopy(states)})
return config
def update(self):
raise NotImplementedError()
def eval(self):
raise NotImplementedError()
class CompositeMetric(MetricBase):
"""
Compute multiple metrics in each minibatch.
for example, merge F1, accuracy, recall into one Metric.
"""
def __init__(self, name=None, **kwargs):
super(CompositeMetric, self).__init__(name, kwargs)
self._metrics = []
def add_metric(self, metric):
if not isinstance(metric, MetricBase):
raise ValueError("SubMetric should be inherit from MetricBase.")
self._metrics.append(metric)
def eval(self):
ans = []
for m in self._metrics:
ans.append(m.eval())
return ans
class Accuracy(MetricBase):
"""
Accumulate the accuracy from minibatches and compute the average accuracy
for every pass.
Args:
name: the metrics name
Example:
minibatch_accuracy = fluid.layers.accuracy(pred, label)
accuracy_evaluator = fluid.metrics.Accuracy()
for epoch in PASS_NUM:
accuracy_evaluator.reset()
for data in batches:
loss = exe.run(fetch_list=[cost, minibatch_accuracy])
accuracy_evaluator.update(value=minibatch_accuracy, weight=batches)
accuracy = accuracy_evaluator.eval()
"""
def __init__(self, name=None):
super(Accuracy, self).__init__(name)
self.value = .0
self.weight = .0
def update(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
self.value += value * weight
self.weight += weight
def eval(self):
if self.weight == 0:
raise ValueError(
"There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy."
)
return self.value / self.weight
class ChunkEvalutor(MetricBase):
"""
Accumulate counter numbers output by chunk_eval from mini-batches and
compute the precision recall and F1-score using the accumulated counter
numbers.
"""
def __init__(self, name=None):
super(ChunkEvalutor, self).__init__(name)
self.num_infer_chunks = 0
self.num_label_chunks = 0
self.num_correct_chunks = 0
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
if not _is_number_or_matrix_(num_infer_chunks):
raise ValueError(
"The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray."
)
if not _is_number_or_matrix_(num_label_chunks):
raise ValueError(
"The 'num_label_chunks' must be a number(int, float) or a numpy ndarray."
)
if not _is_number_or_matrix_(num_correct_chunks):
raise ValueError(
"The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray."
)
self.num_infer_chunks += num_infer_chunks
self.num_label_chunks += num_label_chunks
self.num_correct_chunks += num_correct_chunks
def eval(self):
precision = float(
self.num_correct_chunks
) / self.num_infer_chunks if self.num_infer_chunks else 0
recall = float(self.num_correct_chunks
) / self.num_label_chunks if self.num_label_chunks else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if self.num_correct_chunks else 0
return precision, recall, f1_score
class EditDistance(MetricBase):
"""
Accumulate edit distance sum and sequence number from mini-batches and
compute the average edit_distance and instance error of all batches.
Args:
name: the metrics name
Example:
edit_distance_metrics = fluid.layers.edit_distance(input, label)
distance_evaluator = fluid.metrics.EditDistance()
for epoch in PASS_NUM:
distance_evaluator.reset()
for data in batches:
loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics))
distance_evaluator.update(*edit_distance_metrics)
distance, instance_error = distance_evaluator.eval()
In the above example:
'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass.
"""
def __init__(self, name):
super(EditDistance, self).__init__(name)
self.total_distance = .0
self.seq_num = 0
self.instance_error = 0
def update(self, distances, seq_num):
if not _is_numpy_(distances):
raise ValueError("The 'distances' must be a numpy ndarray.")
if not _is_number_(seq_num):
raise ValueError("The 'seq_num' must be a number(int, float).")
seq_right_count = np.sum(distances == 0)
total_distance = np.sum(distances)
self.seq_num += seq_num
self.instance_error += seq_num - seq_right_count
self.total_distance += total_distance
def eval():
if self.seq_num == 0:
raise ValueError(
"There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
)
avg_distance = self.total_distance / self.seq_num
avg_instance_error = self.instance_error / self.seq_num
return avg_distance, avg_instance_error
class DetectionMAP(MetricBase):
"""
Calculate the detection mean average precision (mAP).
TODO (Dang Qingqing): update the following doc.
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
"""
def __init__(self, name=None):
super(DetectionMAP, self).__init__(name)
# the current map value
self.value = .0
def update(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
self.value += value
self.weight += weight
def eval(self):
if self.weight == 0:
raise ValueError(
"There is no data in DetectionMAP Metrics. "
"Please check layers.detection_map output has added to DetectionMAP."
)
return self.value / self.weight
class Auc(MetricBase):
"""
Auc Metrics which adapts to binary classification.
Need to note that auc metrics compute the value via Python natively.
If you concern the speed, please use the fluid.layers.auc instead.
The `auc` function creates four local variables, `true_positives`,
`true_negatives`, `false_positives` and `false_negatives` that are used to
compute the AUC. To discretize the AUC curve, a linearly spaced set of
thresholds is used to compute pairs of recall and precision values. The area
under the ROC-curve is therefore computed using the height of the recall
values by the false positive rate, while the area under the PR-curve is the
computed using the height of the precision values by the recall.
Args:
name: metric name
curve: Specifies the name of the curve to be computed, 'ROC' [default] or
'PR' for the Precision-Recall-curve.
num_thresholds: The number of thresholds to use when discretizing the roc
curve.
"NOTE: only implement the ROC curve type via Python now."
"""
def __init__(self, name, curve='ROC', num_thresholds=200):
super(MetricBase, self).__init__(name, curve, num_thresholds)
self._curve = curve
self._num_thresholds = num_thresholds
self._epsilon = 1e-6
self.tp_list = np.ndarray((num_thresholds, ))
self.fn_list = np.ndarray((num_thresholds, ))
self.tn_list = np.ndarray((num_thresholds, ))
self.fp_list = np.ndarray((num_thresholds, ))
def update(self, labels, predictions, axis=1):
if not _is_numpy_(labels):
raise ValueError("The 'labels' must be a numpy ndarray.")
if not _is_numpy_(predictions):
raise ValueError("The 'predictions' must be a numpy ndarray.")
kepsilon = 1e-7 # to account for floating point imprecisions
thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
for i in range(num_thresholds - 2)]
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
# caculate TP, FN, TN, FP count
for idx_thresh, thresh in enumerate(thresholds):
tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels):
if lbl:
if predictions[i, 0] >= thresh:
tp += 1
else:
fn += 1
else:
if predictions[i, 0] >= thresh:
fp += 1
else:
tn += 1
tp_list[idx_thresh] += tp
fn_list[idx_thresh] += fn
tn_list[idx_thresh] += tn
fp_list[idx_thresh] += fp
def eval(self):
epsilon = self._epsilon
num_thresholds = self._num_thresholds
tpr = (tp_list.astype("float32") + epsilon) / (
tp_list + fn_list + epsilon)
fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
rec = (tp_list.astype("float32") + epsilon) / (
tp_list + fp_list + epsilon)
x = fpr[:num_thresholds - 1] - fpr[1:]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
auc_value = np.sum(x * y)
return auc_value
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from collections import defaultdict
from paddle.fluid.framework import Program
import framework
......@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
min_average_window, max_average_window and current update times.
Args:
params_grads: A list of parameter-grad variable pairs.
average_window_rate: The rate of average window.
params_grads: A list of parameter-grad variable pairs.
min_average_window: The minimum size of average window.
max_average_window: The maximum size of average window.
......@@ -840,8 +840,8 @@ class ModelAverage(Optimizer):
"""
def __init__(self,
params_grads,
average_window_rate,
params_grads=None,
min_average_window=10000,
max_average_window=10000,
**kwargs):
......@@ -849,24 +849,37 @@ class ModelAverage(Optimizer):
self.average_window = average_window_rate
self.min_average_window = min_average_window
self.max_average_window = max_average_window
self.params_grads = params_grads
self.params_grads = [] if params_grads is None else params_grads
params = {}
for param, grad in self.params_grads:
if param.do_model_average != False:
params[param.name] = (param, grad)
for param in framework.default_main_program().global_block(
).all_parameters():
if param.name not in params and param.do_model_average != False:
grad = param.block.create_var(
name=unique_name.generate(".".join([param.name, 'tmp'])),
dtype=param.dtype,
persistable=False,
stop_gradient=True)
params[param.name] = (param, grad)
self.params_grads = params.values()
for param, grad in self.params_grads:
if grad is not None:
self._append_average_accumulate_op(param)
self._append_average_accumulate_op(param)
self.apply_program = Program()
block = self.apply_program.global_block()
with program_guard(main_program=self.apply_program):
for param_grad in self.params_grads:
if param_grad[1] is not None:
self._add_average_apply_op(block, param_grad)
self._add_average_apply_op(block, param_grad)
self.restore_program = Program()
block = self.restore_program.global_block()
with program_guard(main_program=self.restore_program):
for param_grad in self.params_grads:
if param_grad[1] is not None:
self._add_average_restore_op(block, param_grad)
self._add_average_restore_op(block, param_grad)
def _add_average_apply_op(self, block, param_grad):
param = block.clone_variable(param_grad[0])
......
......@@ -100,9 +100,11 @@ class ParallelExecutor(object):
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
persistable_vars = [
self.persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, main.list_vars())
for v in filter(lambda var: \
var.persistable and var.type != core.VarDesc.VarType.RAW,
main.list_vars())
]
self.executor = core.ParallelExecutor(
......@@ -113,7 +115,7 @@ class ParallelExecutor(object):
p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient
]),
set(persistable_vars),
set(self.persistable_vars),
main.desc,
loss_name if loss_name else '',
scope,
......@@ -143,3 +145,6 @@ class ParallelExecutor(object):
self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
return [arr[i] for i in range(len(arr))]
def bcast_params(self):
self.executor.bcast_params(set(self.persistable_vars))
......@@ -28,13 +28,15 @@ class ParamAttr(object):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None):
gradient_clip=None,
do_model_average=None):
self.name = name
self.initializer = initializer
self.learning_rate = learning_rate
self.regularizer = regularizer
self.trainable = trainable
self.gradient_clip = gradient_clip
self.model_average = do_model_average
def set_default_initializer(self, initializer):
if initializer is None:
......@@ -80,7 +82,8 @@ class ParamAttr(object):
},
'regularizer': self.regularizer,
'trainable': self.trainable,
'gradient_clip_attr': self.gradient_clip
'gradient_clip_attr': self.gradient_clip,
'model_average': self.model_average
}
if with_initializer:
kwargs['initializer'] = self.initializer
......@@ -90,7 +93,7 @@ class ParamAttr(object):
class WeightNormParamAttr(ParamAttr):
"""
Used for weight normalization. Any field in ParamAttr can also be set here.
Besides, an extra field dim can be set to indicate the dimension except
Besides, an extra field dim can be set to indicate the dimension except
which to normalize.
"""
# List to record the parameters reparameterized by weight normalization.
......
......@@ -37,7 +37,7 @@ depth = 8
mix_hidden_lr = 1e-3
IS_SPARSE = True
PASS_NUM = 10
PASS_NUM = 100
BATCH_SIZE = 10
embedding_name = 'emb'
......@@ -77,7 +77,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
......@@ -94,8 +95,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
])
lstm = fluid.layers.dynamic_lstm(
......@@ -109,8 +110,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
])
return feature_out
......@@ -171,7 +172,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
# check other optimizers and check why out will be NAN
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.0001,
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
......@@ -233,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
print("second per batch: " + str((time.time(
) - start_time) / batch_id))
# Set the threshold low to speed up the CI test
if float(pass_precision) > 0.05:
if float(pass_precision) > 0.01:
if save_dirname is not None:
# TODO(liuyiqun): Change the target to crf_decode
fluid.io.save_inference_model(save_dirname, [
......
......@@ -14,23 +14,13 @@
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest
from paddle.fluid.framework import grad_var_name
def get_backward_op(scope, op, no_grad_set):
backward_op = core.Operator.backward(op, no_grad_set)
for input in backward_op.input_vars():
var = scope.var(input)
var.get_tensor()
for output in backward_op.output_vars():
var = scope.var(output)
var.get_tensor()
return backward_op
def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
x_shape = x.shape
if len(x_shape) == 2:
......@@ -64,11 +54,6 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
def _reference_training(x, scale, offset, epsilon, data_format):
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
if data_format == "NCHW":
n, c, h, w = x.shape
......@@ -88,8 +73,6 @@ def _reference_training(x, scale, offset, epsilon, data_format):
offset_tile = np.reshape(offset, (1, c, 1, 1))
offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
y = normalized * scale_tile + offset_tile
if len(x_shape) == 2:
y = np.reshape(y, (y.shape[0], y.shape[1]))
return y, mean, var
elif data_format == "NHWC":
x_square = x * x
......@@ -100,59 +83,42 @@ def _reference_training(x, scale, offset, epsilon, data_format):
var = x_square_sum / element_count - mean * mean
normalized = (x - mean) / np.sqrt(var + epsilon)
y = normalized * scale + offset
if len(x_shape) == 2:
y = np.reshape(y, x_shape)
return y, mean, var
else:
raise ValueError("Unknown data order.")
def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
# Use the following formulas to calculate gradients:
# grad_scale =
# sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
#
# grad_offset = sum(output_y)
#
# grad_x =
# x_grad =
# 1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
# (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
# transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
grad_y = np.reshape(grad_y,
(grad_y.shape[0], grad_y.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
grad_y = np.reshape(grad_y,
(grad_y.shape[0], 1, 1, grad_y.shape[1]))
if data_format == "NCHW":
x = np.transpose(x, (0, 2, 3, 1))
grad_y = np.transpose(grad_y, (0, 2, 3, 1))
y_grad = np.transpose(y_grad, (0, 2, 3, 1))
# raise ValueError("data_format must be NHWC, got %s." % data_format)
grad_x = scale * (grad_y - np.mean(
grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean(
grad_y * (x - mean), axis=(0, 1, 2)) /
x_grad = scale * (y_grad - np.mean(
y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
y_grad * (x - mean), axis=(0, 1, 2)) /
(var + epsilon)) / np.sqrt(var + epsilon)
grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon),
grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
axis=(0, 1, 2))
grad_offset = np.sum(grad_y, axis=(0, 1, 2))
grad_offset = np.sum(y_grad, axis=(0, 1, 2))
# transfer back to N, C, H, W
if data_format == "NCHW":
grad_x = np.transpose(grad_x, (0, 3, 1, 2))
x_grad = np.transpose(x_grad, (0, 3, 1, 2))
x = np.transpose(x, (0, 3, 1, 2))
grad_y = np.transpose(grad_y, (0, 3, 1, 2))
y_grad = np.transpose(y_grad, (0, 3, 1, 2))
if len(x_shape) == 2:
grad_x = np.reshape(grad_x, x_shape)
return grad_x, grad_scale, grad_offset
return x_grad, grad_scale, grad_offset
def create_or_get_tensor(scope, var_name, var, place):
......@@ -186,7 +152,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
__set_tensor__(output, data)
class TestBatchNormOpInference(OpTest):
class TestBatchNormOpInference(unittest.TestCase):
def setUp(self):
self.dtype = np.float32
......@@ -304,231 +270,121 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
self.check_with_place(place, data_format, self.dtype, [2, 3])
class TestBatchNormOpTraining(OpTest):
class TestBatchNormOpTraining(unittest.TestCase):
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
if not np.allclose(np.array(tensor), np_array, atol=atol):
import pdb
pdb.set_trace()
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def test_python_testing(self):
data_format = "NHWC"
epsilon = 0.00001
n, h, w, c = 2, 3, 4, 5
x_shape = [n, h, w, c]
scale_shape = [c]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32)
y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
epsilon, "NHWC")
# running N, C, H, W case
# should produce the same results
x_shape2 = [n, c, h, w]
x_val2 = np.transpose(x_val, (0, 3, 1, 2))
y_out2 = _reference_testing(x_val2, scale_val, bias_val, mean, variance,
epsilon, "NCHW")
# transfer (N, C, H, W) back to (N, H, W, C)
y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
self.__assert_close(y_out, y_out2_trans, "inference output")
print 'python: NHWC, NCHW, inference checking passed'
def test_python_training(self):
data_format = "NHWC"
epsilon = 0.00001
momentum = 0.9
# N, H, W, C: 2, 3, 4, 2
n, h, w, c = 2, 3, 4, 5
x_shape = [n, h, w, c]
scale_shape = [c]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32)
# run forward
y_out, saved_mean, var_ref = _reference_training(
x_val, scale_val, bias_val, epsilon, "NHWC")
#
mean_out = saved_mean * (1. - momentum) + momentum * mean
variance_out = var_ref * (1. - momentum) + momentum * variance
saved_variance = 1. / np.sqrt(var_ref + epsilon)
# running N, C, H, W case
# should produce the same results
x_shape2 = [n, c, h, w]
x_val2 = np.transpose(x_val, (0, 3, 1, 2))
y_out2, saved_mean2, var_ref2 = _reference_training(
x_val2, scale_val, bias_val, epsilon, "NCHW")
self.__assert_close(saved_mean, saved_mean2, "batch mean")
self.__assert_close(var_ref, var_ref2, "batch variance")
# transfer (N, C, H, W) back to (N, H, W, C)
y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
self.__assert_close(y_out, y_out2_trans, "batch output")
print 'python: NHWC, NCHW, forward checking passed'
# test backward now
# NHWC
self.y_grad = np.random.random_sample(x_shape).astype(np.float32)
y_grad = self.y_grad
# y_grad = np.ones(x_shape).astype(np.float32)
x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, "NHWC")
# NCHW
y_grad2 = np.transpose(y_grad, (0, 3, 1, 2))
# y_grad2 = np.ones(x_shape2).astype(np.float32)
x_grad_ref2, scale_grad_ref2, bias_grad_ref2 = _reference_grad(
x_val2, y_grad2, scale_val, saved_mean2, var_ref2, epsilon, "NCHW")
self.__assert_close(scale_grad_ref, scale_grad_ref2, "scale gradient")
self.__assert_close(bias_grad_ref, bias_grad_ref2, "bias gradient")
x_grad_transpose = np.transpose(x_grad_ref2, (0, 2, 3, 1))
self.__assert_close(x_grad_ref, x_grad_transpose, "x gradient")
print 'python: NHWC, NCHW, backward checking passed'
def test_forward_backward(self):
def test_with_place(place, data_layout, shape):
# attr
epsilon = 0.00001
momentum = 0.9
if len(shape) == 2:
x_shape = shape
c = shape[1]
if data_layout == "NCHW":
n, c, h, w = shape[0], shape[1], shape[2], shape[3]
else:
# n, h, w, c = 2, 3, 4, 2
n, h, w, c = shape[0], shape[1], shape[2], shape[3]
if data_format == "NHWC":
x_shape = [n, h, w, c]
elif data_format == "NCHW":
x_shape = [n, c, h, w]
else:
raise ValueError("Unknown data type.")
scale_shape = [c]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
np.random.seed(123)
x = np.random.random_sample(shape).astype(np.float32)
scale = np.random.random_sample(scale_shape).astype(np.float32)
bias = np.random.random_sample(scale_shape).astype(np.float32)
mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32)
# run forward
y_out, saved_mean, var_ref = _reference_training(
x_val, scale_val, bias_val, epsilon, data_format)
# update moving mean and variance
y, saved_mean, var_ref = _reference_training(x, scale, bias,
epsilon, data_layout)
mean_out = saved_mean * (1. - momentum) + momentum * mean
variance_out = var_ref * (1. - momentum) + momentum * variance
saved_variance = 1. / np.sqrt(var_ref + epsilon)
# for gradient test
# y_grad = np.ones(x_shape).astype(np.float32)
y_grad = np.zeros(x_shape).astype(np.float32)
if len(y_grad.shape) == 2:
y_grad[0, 0] = 1.
else:
y_grad[0, 0, 0, 0] = 1.
# y_grad = np.random.random_sample(x_shape).astype(np.float32)
x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
data_format)
scope = core.Scope()
# create input
x_tensor = create_or_get_tensor(scope, "x_val", x_val, place)
scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val,
place)
bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val,
place)
mean_tensor = create_or_get_tensor(scope, "mean", mean, place)
variance_tensor = create_or_get_tensor(scope, "variance", variance,
place)
# create output
y_tensor = create_or_get_tensor(scope, "y_out", None, place)
saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
place)
saved_variance_tensor = create_or_get_tensor(
scope, "saved_variance", None, place)
mean_out_tensor = mean_tensor
variance_out_tensor = variance_tensor
batch_norm_op = Operator(
"batch_norm",
# inputs
X="x_val",
Scale="scale_val",
Bias="bias_val",
Mean="mean",
Variance="variance",
# outputs
Y="y_out",
MeanOut="mean",
VarianceOut="variance",
SavedMean="saved_mean",
SavedVariance="saved_variance",
# attrs
is_test=False,
data_layout=data_layout,
momentum=momentum,
epsilon=epsilon)
batch_norm_op.run(scope, place)
# check forward result
self.__assert_close(y_tensor, y_out, "y_out")
self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean")
self.__assert_close(saved_variance_tensor, saved_variance,
"saved_variance")
self.__assert_close(mean_out_tensor, mean_out, "mean_out")
if isinstance(place, core.CUDAPlace):
atol = 5e-2
else:
atol = 1e-4
self.__assert_close(variance_out_tensor, variance_out,
"variance_out", atol)
print "op test forward passed: ", str(place), data_layout
# run backward
batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
set_output_grad(
scope,
["y_out", "mean", "variance", "saved_mean", "saved_variance"],
place,
feed_dict={"y_out": y_grad})
batch_norm_op_grad.run(scope, place)
x_grad_tensor = create_or_get_tensor(scope,
grad_var_name("x_val"), None,
place)
scale_grad_tensor = create_or_get_tensor(scope,
grad_var_name("scale_val"),
None, place)
bias_grad_tensor = create_or_get_tensor(scope,
grad_var_name("bias_val"),
None, place)
y_grad = np.random.random_sample(shape).astype(np.float32)
x_grad, scale_grad, bias_grad = _reference_grad(
x, y_grad, scale, saved_mean, var_ref, epsilon, data_format)
var_dict = locals()
var_dict['y@GRAD'] = y_grad
var_names = [
'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
'saved_variance'
]
ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
for name in ground_truth:
block.create_var(
name=name,
dtype='float32',
shape=ground_truth[name].shape)
bn_op = block.append_op(
type="batch_norm",
inputs={
"X": block.var('x'),
"Scale": block.var('scale'),
"Bias": block.var('bias'),
"Mean": block.var('mean'),
"Variance": block.var('variance')
},
outputs={
"Y": block.var('y'),
"MeanOut": block.var('mean'), # share the same memory
"VarianceOut":
block.var('variance'), # share the same memory
"SavedMean": block.var('saved_mean'),
"SavedVariance": block.var('saved_variance')
},
attrs={
"momentum": momentum,
"epsilon": epsilon,
"is_test": False,
"data_layout": data_layout
})
block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
# generate backward op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
bn_op.desc, set(), [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
for var_name in grad_op_desc.output_arg_names():
block.desc.var(var_name.encode("ascii"))
grad_op_desc.infer_var_type(block.desc)
grad_op_desc.infer_shape(block.desc)
for arg in grad_op_desc.output_arg_names():
grad_var = block.desc.find_var(arg.encode("ascii"))
grad_var.set_dtype(core.VarDesc.VarType.FP32)
exe = fluid.Executor(place)
out = exe.run(
program,
feed={
name: var_dict[name]
for name in
['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
},
fetch_list=[
'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
'x@GRAD', 'scale@GRAD', 'bias@GRAD'
])
self.__assert_close(y, out[0], "y")
self.__assert_close(mean_out, out[1], "mean")
self.__assert_close(variance_out, out[2], "variance", 1e-3)
self.__assert_close(saved_mean, out[3], "saved_mean")
self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
self.__assert_close(x_grad, out[5], "x_grad")
self.__assert_close(scale_grad, out[6], "scale_grad")
self.__assert_close(bias_grad, out[7], "bias_grad")
# check gradient output
self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
print "op test backward passed: ", str(place), data_layout
print "op test forward passed: ", str(place), data_layout
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
......@@ -537,7 +393,6 @@ class TestBatchNormOpTraining(OpTest):
for place in places:
for data_format in ["NCHW", "NHWC"]:
test_with_place(place, data_format, [2, 3, 4, 5])
test_with_place(place, data_format, [2, 3])
if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import paddle.fluid.core as core
import unittest
import numpy as np
from paddle.fluid.op import Operator, CondOp
class PySimpleCond(object):
'''
A simple implementation of dynamic if-else based on numpy
'''
def __init__(self):
array = [1] * 10
for i in range(1, 10, 2):
array[i] = 0
self.cond = np.array(array)
self.x = np.ones(shape=(10, 1)).astype("float32")
def forward(self):
self.index_t = np.where(self.cond == 1)
self.index_f = np.where(self.cond == 0)
y_t = self.x[self.index_t]
y_f = self.x[self.index_f]
y_t = y_t * 2.
y_f = y_f * (-2.)
output = np.zeros(shape=(10, 1))
output[self.index_t] = y_t
output[self.index_f] = y_f
return output
class PySimpleCondTest(unittest.TestCase):
def setUp(self):
self.condnn = PySimpleCond()
def test_forward(self):
output = self.condnn.forward()
def create_tensor(scope, name, shape, np_data):
tensor = scope.var(name).get_tensor()
tensor.set_dims(shape)
tensor.set(np_data, core.CPUPlace())
return tensor
class TestCondOp(unittest.TestCase):
'''
Test CondOp
equation:
cond = [True, False, True, False, ...]
y[index_t] = x[index_t] * 2.
y[index_f] = x[index_f] * -2.
outputs:
y
'''
def setUp(self):
self.py_cond = PySimpleCond()
def forward(self):
self.scope = core.Scope()
self.create_global_variables()
self.create_cond_op()
self.create_sub_net()
self.condop.run(self.scope, core.CPUPlace())
return np.array(self.scope.find_var("Out").get_tensor())
def create_global_variables(self):
x_np_data = self.py_cond.x
create_tensor(self.scope, "X", [10, 1], x_np_data)
cond_np_data = self.py_cond.cond.astype("int32")
create_tensor(self.scope, "cond", [10, 1], cond_np_data)
self.scope.var("SubScopes")
self.scope.var("IndexTensors")
self.scope.var("Out")
def create_cond_op(self):
self.condop = CondOp(
Cond="cond",
Xs=["X"],
Outs=["Out"],
SubScopes="SubScopes",
IndexTensors="IndexTensors")
def create_sub_net(self):
truenet = core.Net.create()
scale_op_t = Operator("scale", X='X', Out='Out', scale=2.)
truenet.append_op(scale_op_t)
truenet.complete_add_op(True)
self.condop.set_truenet(truenet)
falsenet = core.Net.create()
scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.)
falsenet.append_op(scale_op_t)
falsenet.complete_add_op(True)
self.condop.set_falsenet(falsenet)
def test_forward(self):
print 'test cond op forward'
pd_output = self.forward()
py_output = self.py_cond.forward()
print 'pd_output', pd_output
print
print 'py_output', py_output
self.assertEqual(pd_output.shape, py_output.shape)
print 'test passed'
return 0
if __name__ == "__main__":
unittest.main()
......@@ -15,10 +15,8 @@ import unittest
import numpy as np
from operator import mul
from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
np.random.random(123)
......@@ -70,161 +68,93 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
return grad_x, d_scale, d_bias
def get_backward_op(scope, op, no_grad_set):
backward_op = core.Operator.backward(op, no_grad_set)
for input in backward_op.input_vars():
var = scope.var(input)
var.get_tensor()
for output in backward_op.output_vars():
var = scope.var(output)
var.get_tensor()
return backward_op
def create_or_get_tensor(scope, var_name, var, place):
tensor = scope.var(var_name).get_tensor()
if var is not None:
assert isinstance(var, np.ndarray)
tensor.set_lod([[]])
tensor.set_dims(var.shape)
tensor.set(var, place)
return tensor
def set_output_grad(scope, outputs, place, feed_dict=None):
def __set_tensor__(name, data=None):
out_tensor = scope.find_var(name).get_tensor()
grad_tensor = scope.var(grad_var_name(name)).get_tensor()
out_dtype = out_tensor.dtype()
if data is None:
if out_dtype == core.VarDesc.VarType.FP64:
data = np.ones(out_tensor.shape(), dtype=np.float64)
elif out_dtype == core.VarDesc.VarType.FP32:
data = np.ones(out_tensor.shape(), dtype=np.float32)
else:
raise ValueError("Not supported data type " + str(out_dtype))
grad_tensor.set(data, place)
for output in outputs:
data = None
if output in feed_dict:
data = feed_dict[output]
__set_tensor__(output, data)
class TestLayerNormdOp(OpTest):
class TestLayerNormdOp(unittest.TestCase):
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def __assert_grad_close(self,
tensor,
np_array,
name,
place,
max_relative_error=0.02):
a = np.array(tensor)
b = np_array
abs_a = np.abs(a)
abs_a[abs_a < 1e-5] = 1
diff_mat = np.abs(a - b) / abs_a
max_diff = np.max(diff_mat)
def err_msg():
offset = np.argmax(diff_mat > max_relative_error)
return ("%s Variable %s max gradient diff %f over limit %f, "
"the first error element is %d, %f, %f") % (
"Gradient Check On %s" % str(place), name, max_diff,
max_relative_error, offset, a.flatten()[offset],
b.flatten()[offset])
self.assertLessEqual(max_diff, max_relative_error, err_msg())
def check_forward_backward(self, shape, begin_norm_axis):
def test_with_place(place, shape, begin_norm_axis=1):
# setUp
assert begin_norm_axis > 0 and begin_norm_axis < len(
shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
def test_with_place(place, shape, begin_norm_axis):
# attr
epsilon = 0.00001
x_shape = shape
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
scale_shape = [D]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
np.random.seed(123)
x = np.random.random_sample(x_shape).astype(np.float32)
scale = np.random.random_sample(scale_shape).astype(np.float32)
bias = np.random.random_sample(scale_shape).astype(np.float32)
y_grad = np.random.random_sample(x_shape).astype(np.float32)
# run forward
y_out, saved_mean, var_ref = _reference_layer_norm_naive(
x_val, scale_val, bias_val, epsilon, begin_norm_axis)
naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
# get gradient
x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
naive_grad = {
"X": x_grad_ref,
"Scale": scale_grad_ref,
"Bias": bias_grad_ref
}
scope = core.Scope()
# create input
input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
for i_name in input_map:
create_or_get_tensor(scope, i_name, input_map[i_name], place)
# create output
output_map = {"Y": None, "Mean": None, "Variance": None}
output_tensor = {}
for o_name in output_map:
output_tensor[o_name] = create_or_get_tensor(
scope, o_name, output_map[o_name], place)
layer_norm_op = Operator(
"layer_norm",
# inputs
X="X",
Scale="Scale",
Bias="Bias",
# outputs
Y="Y",
Mean="Mean",
Variance="Variance",
# attrs
epsilon=epsilon,
begin_norm_axis=begin_norm_axis)
layer_norm_op.run(scope, place)
# check forward result
atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
for o_tensor in output_tensor:
self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
o_tensor, atol)
# run backward
layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
set_output_grad(
scope, ["Y", "Mean", "Variance"],
place,
feed_dict={"Y": y_grad})
layer_norm_op_grad.run(scope, place)
# get output
grad_tensor = {}
for o_name in naive_grad:
grad_tensor[o_name] = x_ = create_or_get_tensor(
scope, grad_var_name(o_name), None, place)
# check gradient output
for o_grad in naive_grad:
self.__assert_grad_close(grad_tensor[o_grad],
naive_grad[o_grad], o_grad + "@GRAD",
place)
# reference forward & backward
y, mean, variance = _reference_layer_norm_naive(
x, scale, bias, epsilon, begin_norm_axis)
x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
x, y_grad, scale, mean, variance, begin_norm_axis)
var_dict = locals()
var_dict['y@GRAD'] = y_grad
var_names = [
'x', 'scale', 'bias', 'mean', 'variance', 'y', 'y@GRAD'
]
ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
for name in ground_truth:
block.create_var(
name=name,
dtype='float32',
shape=ground_truth[name].shape)
layer_norm_op = block.append_op(
type="layer_norm",
inputs={
"X": block.var('x'),
"Scale": block.var('scale'),
"Bias": block.var('bias'),
},
outputs={
"Y": block.var('y'),
"Mean": block.var('mean'), # share the same memory
"Variance":
block.var('variance'), # share the same memory
},
attrs={
"epsilon": epsilon,
"begin_norm_axis": begin_norm_axis
})
# generate backward op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
layer_norm_op.desc, set(), [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
for var_name in grad_op_desc.output_arg_names():
block.desc.var(var_name.encode("ascii"))
grad_op_desc.infer_var_type(block.desc)
grad_op_desc.infer_shape(block.desc)
for arg in grad_op_desc.output_arg_names():
grad_var = block.desc.find_var(arg.encode("ascii"))
grad_var.set_dtype(core.VarDesc.VarType.FP32)
exe = fluid.Executor(place)
out = exe.run(program,
feed={
name: var_dict[name]
for name in ['x', 'scale', 'bias', 'y@GRAD']
},
fetch_list=[
'y', 'mean', 'variance', 'x@GRAD',
'scale@GRAD', 'bias@GRAD'
])
self.__assert_close(y, out[0], "y")
self.__assert_close(mean, out[1], "mean")
self.__assert_close(variance, out[2], "variance", 1e-3)
self.__assert_close(x_grad, out[3], "x_grad")
self.__assert_close(scale_grad, out[4], "scale_grad", 1e-3)
self.__assert_close(bias_grad, out[5], "bias_grad")
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
......@@ -237,15 +167,6 @@ class TestLayerNormdOp(OpTest):
self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
def test_check_forward_backward_with_scale(self):
pass # TODO(zcd)
def test_check_forward_backward_with_bias(self):
pass # TODO(zcd)
def test_check_forward_backward(self):
pass # TODO(zcd)
if __name__ == '__main__':
unittest.main()
......@@ -32,7 +32,6 @@ class TestBook(unittest.TestCase):
cost = layers.square_error_cost(input=y_predict, label=y)
avg_cost = layers.mean(cost)
self.assertIsNotNone(avg_cost)
program.append_backward(avg_cost)
print(str(program))
......@@ -94,8 +93,6 @@ class TestBook(unittest.TestCase):
cost = layers.cross_entropy(input=predict, label=label)
avg_cost = layers.mean(cost)
program.append_backward(avg_cost)
print(str(program))
def test_word_embedding(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import unittest
def fc(X, W, Y):
ret_v = core.Net.create()
ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y))
ret_v.complete_add_op(True)
return ret_v
class TestNet(unittest.TestCase):
def test_net_all(self):
net = core.Net.create()
op1 = Operator("sum", X=["X", "Y"], Out="Out")
net.append_op(op1)
net2 = core.Net.create()
net2.append_op(fc(X="X", W="w", Y="fc.out"))
net2.complete_add_op(True)
net.append_op(net2)
net.complete_add_op(True)
expected = '''
Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}.
'''
self.assertEqual(expected, "\n" + str(net))
if __name__ == "__main__":
unittest.main()
......@@ -473,7 +473,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
loss = simple_fc_net(True)
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.0001)
opt = fluid.optimizer.SGD(learning_rate=0.001)
opt.minimize(loss)
batch_size = 32
......@@ -500,4 +500,8 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
train_loss = numpy.array(train_loss)
self.assertTrue(numpy.allclose(train_loss, test_loss))
self.assertTrue(
numpy.allclose(
train_loss, test_loss, atol=1e-8),
"Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss))
......@@ -87,57 +87,6 @@ class TestProgram(unittest.TestCase):
print(prog)
print(prog_restored)
def test_append_backward(self):
prog = Program()
block = prog.global_block()
mul_x = block.create_var(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
mul_op = block.append_op(
type="mul",
inputs={"X": [mul_x],
"Y": mul_y},
outputs={"Out": [mul_out]},
attrs={"x_num_col_dims": 1})
add_y = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="add.y")
add_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="add.out")
add_op = block.append_op(
type="elementwise_add",
inputs={"X": mul_out,
"Y": add_y},
outputs={"Out": add_out},
attrs={"x_num_col_dims": 1})
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op(
type="mean", inputs={"X": add_out}, outputs={"Out": mean_out})
self.assertEqual(mul_op.idx, 0)
self.assertEqual(add_op.idx, 1)
param_to_grad = prog.append_backward(mean_out, set())
for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
"mean.out"):
self.assertEqual(param_to_grad[var_name][0],
grad_var_name(var_name))
self.assertEqual(param_to_grad[var_name][1], 0)
expect_ops = [
"mul", "elementwise_add", "mean", "fill_constant", "mean_grad",
"elementwise_add_grad", "mul_grad"
]
actual_ops = []
for op in block.ops:
actual_ops.append(op.type)
self.assertEqual(actual_ops, expect_ops)
def test_program_clone_with_parameter(self):
main_program = Program()
startup_program = Program()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册