提交 96f24213 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_dqn

...@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " ...@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32) if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib) set(CMAKE_STATIC_LIBRARY_PREFIX lib)
...@@ -62,6 +64,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) ...@@ -62,6 +64,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON) option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
...@@ -188,7 +191,14 @@ include(configure) # add paddle env configuration ...@@ -188,7 +191,14 @@ include(configure) # add paddle env configuration
if(WITH_GPU) if(WITH_GPU)
include(cuda) include(cuda)
include(tensorrt) include(tensorrt)
include(anakin_subgraph)
endif()
if(WITH_GPU AND NOT WIN32)
message(STATUS "add dgc lib.")
include(external/dgc)
endif() endif()
if(WITH_MKL OR WITH_MKLML) if(WITH_MKL OR WITH_MKLML)
include(external/anakin) include(external/anakin)
elseif() elseif()
......
...@@ -156,7 +156,7 @@ python \ ...@@ -156,7 +156,7 @@ python \
This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. When coding C++, please follow the verbose level convention as follows: This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. When coding C++, please follow the verbose level convention as follows:
- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) - verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework)
- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) - verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)
- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) - verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math) - verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators/math/)
...@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, ...@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
else: else:
build_strategy.reduce_strategy = fluid.BuildStrategy( build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce ).ReduceStrategy.AllReduce
build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
avg_loss = train_args[0] avg_loss = train_args[0]
......
if(NOT WITH_GPU)
return()
endif()
set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
find_path(ANAKIN_INCLUDE_DIR anakin_config.h
PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include
NO_DEFAULT_PATH
)
find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
PATHS ${ANAKIN_ROOT}
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib
NO_DEFAULT_PATH
DOC "Path to ANAKIN library.")
if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
if(WITH_DSO)
set(ANAKIN_FOUND ON)
endif(WITH_DSO)
else()
set(ANAKIN_FOUND OFF)
endif()
if(ANAKIN_FOUND)
message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
include_directories(${ANAKIN_ROOT}/include)
include_directories(${ANAKIN_ROOT}/include/saber)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
...@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost") ...@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here. # So we use 1.41.0 here.
set(BOOST_VER "1.41.0") set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc")
SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
ExternalProject_Add(
extern_dgc
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet"
GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc"
SOURCE_DIR "${DGC_SOURCES_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND cd collective && make -j
INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc
&& cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES}
&& cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
BUILD_IN_SOURCE 1
)
ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
ADD_DEPENDENCIES(dgc extern_dgc)
LIST(APPEND external_project_dependencies dgc)
...@@ -44,7 +44,7 @@ ExternalProject_Add( ...@@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party", # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party # checkout and clean other dirs under third_party
# 4. remove .git, and package the directory. # 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz" URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf" URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR} PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
......
...@@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") ...@@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227") SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32) IF(WIN32)
SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE) SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
...@@ -43,7 +43,7 @@ ELSE() ...@@ -43,7 +43,7 @@ ELSE()
#TODO(intel-huying): #TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated as offical version later. # Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
......
...@@ -57,20 +57,25 @@ SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) ...@@ -57,20 +57,25 @@ SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
ExternalProject_Add( ExternalProject_Add(
${NGRAPH_PROJECT} ${NGRAPH_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT}
GIT_REPOSITORY ${NGRAPH_GIT_REPO} GIT_REPOSITORY ${NGRAPH_GIT_REPO}
GIT_TAG ${NGRAPH_GIT_TAG} GIT_TAG ${NGRAPH_GIT_TAG}
PREFIX ${NGRAPH_SOURCES_DIR} PREFIX ${NGRAPH_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} CMAKE_GENERATOR ${CMAKE_GENERATOR}
CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM}
CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE CMAKE_GENERATOR_TOOLSET ${CMAKE_GENERATOR_TOOLSET}
CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR} CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
) )
add_dependencies(ngraph ${NGRAPH_PROJECT}) add_dependencies(ngraph ${NGRAPH_PROJECT})
......
...@@ -131,6 +131,15 @@ elseif (NOT CBLAS_FOUND OR WIN32) ...@@ -131,6 +131,15 @@ elseif (NOT CBLAS_FOUND OR WIN32)
) )
endif () endif ()
if (WITH_GPU AND NOT WIN32)
set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc")
copy(dgc_lib
SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include
DSTS ${dgc_dir} ${dgc_dir}
DEPS dgc)
endif()
if (WITH_MKLDNN) if (WITH_MKLDNN)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
copy(mkldnn_lib copy(mkldnn_lib
......
...@@ -110,7 +110,7 @@ function(op_library TARGET) ...@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here. # Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op") "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
......
...@@ -33,5 +33,6 @@ if(TENSORRT_FOUND) ...@@ -33,5 +33,6 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR}) include_directories(${TENSORRT_INCLUDE_DIR})
link_directories(${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT) add_definitions(-DPADDLE_WITH_TENSORRT)
endif() endif()
此差异已折叠。
...@@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto ...@@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
cc_test(reader_test SRCS reader_test.cc DEPS reader) cc_test(reader_test SRCS reader_test.cc DEPS reader)
...@@ -164,6 +164,8 @@ else() ...@@ -164,6 +164,8 @@ else()
set(NGRAPH_EXE_DEPS) set(NGRAPH_EXE_DEPS)
endif() endif()
cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
...@@ -174,7 +176,7 @@ else() ...@@ -174,7 +176,7 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif() endif()
target_link_libraries(executor garbage_collector while_op_helper) target_link_libraries(executor while_op_helper executor_gc_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
...@@ -193,7 +195,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto) ...@@ -193,7 +195,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc) proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper) cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
...@@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout = out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims()); std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz; std::vector<int> out_tz = in_tz;
...@@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type()); "Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type; memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// output tensor has the same dims as input. Reorder don't change dims // output tensor has the same dims as input. Reorder don't change dims
out->Resize(in.dims()); out->Resize(in.dims());
// tempory mem pd fr out , to make reorder if (in_format != out_format) {
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
void* in_data = GetDataFromTensor(in, in_type); void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data); auto in_memory =
auto out_memory = memory(out_mem_pd, out_data); memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
platform::Reorder(in_memory, out_memory); platform::Reorder(in_memory, out_memory);
} else { } else {
out->ShareDataWith(in); out->ShareDataWith(in);
} }
out->set_layout(out_layout); out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
#endif #endif
} }
......
...@@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type, ...@@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur // Just set layout/format. No real transform occur
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor); out.ShareDataWith(input_tensor);
// TODO(jczaja): Remove that once all mkldnn ops out.set_layout(DataLayout::kMKLDNN);
// are modified to work with mkldnn_blocked out.set_format(out_format);
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.set_mkldnn_prim_desc(out_mem_pd);
#endif #endif
} else { } else {
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
...@@ -5,11 +5,16 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h ...@@ -5,11 +5,16 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
...@@ -20,7 +25,13 @@ if(WITH_DISTRIBUTE) ...@@ -20,7 +25,13 @@ if(WITH_DISTRIBUTE)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
set(dgc_deps "")
if(NOT WIN32)
set(dgc_deps dgc)
endif()
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor ${dgc_deps})
nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor) dynload_cuda variable_visitor)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
...@@ -35,6 +46,8 @@ if(WITH_GPU) ...@@ -35,6 +46,8 @@ if(WITH_GPU)
else() else()
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor) variable_visitor)
cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor sendrecvop_rpc) ddim selected_rows_functor sendrecvop_rpc)
...@@ -46,9 +59,7 @@ else() ...@@ -46,9 +59,7 @@ else()
cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
endif() endif()
cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
if(WITH_GPU) if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
...@@ -69,7 +80,9 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap ...@@ -69,7 +80,9 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU) if (WITH_GPU)
...@@ -99,4 +112,6 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS ...@@ -99,4 +112,6 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
multi_devices_graph_print_pass multi_devices_graph_check_pass multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass) memory_optimize_pass lock_free_optimize_pass
alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass
fuse_adam_op_pass fuse_sgd_op_pass)
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <algorithm> #include <algorithm>
#include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
...@@ -41,8 +42,7 @@ VarHandle* GetValidInput(const OpHandleBase* a) { ...@@ -41,8 +42,7 @@ VarHandle* GetValidInput(const OpHandleBase* a) {
return nullptr; return nullptr;
} }
std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
std::unique_ptr<ir::Graph> graph) const {
auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph); auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// get vars order // get vars order
...@@ -52,13 +52,28 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( ...@@ -52,13 +52,28 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
// Note that must assert topology sort is stable // Note that must assert topology sort is stable
auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs); auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
for (auto* op_desc : ops) { for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs(); try {
for (auto& o_it : outputs) { bool is_bk_op =
for (auto& v : o_it.second) { // values static_cast<bool>(boost::get<int>(op_desc->GetAttr(
vars[v] = order; OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) continue;
auto backward_vars =
boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
for (auto& v : o_it.second) { // values
vars[v] = order;
VLOG(10) << "in all_reduce_deps_pass:" << v;
}
} }
order++;
} catch (boost::bad_get e) {
} }
order++;
} }
std::vector<OpHandleBase*> dist_ops; std::vector<OpHandleBase*> dist_ops;
...@@ -70,7 +85,8 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( ...@@ -70,7 +85,8 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
} }
} }
VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl; VLOG(10) << "dist_ops size:" << dist_ops.size()
<< ", outputs size:" << vars.size() << ", ops size:" << ops.size();
std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1, std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
OpHandleBase* op2) { OpHandleBase* op2) {
...@@ -83,6 +99,10 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( ...@@ -83,6 +99,10 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
auto l_it = vars.find(i0->name()); auto l_it = vars.find(i0->name());
auto r_it = vars.find(i1->name()); auto r_it = vars.find(i1->name());
PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
"can't find var's name %s and %s in opdesc", i0->name(),
i1->name());
if (l_it->second < r_it->second) return true; if (l_it->second < r_it->second) return true;
if (l_it->second == r_it->second) { if (l_it->second == r_it->second) {
...@@ -110,8 +130,6 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( ...@@ -110,8 +130,6 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
VLOG(10) << "pre_op:" << pre_op->DebugString() VLOG(10) << "pre_op:" << pre_op->DebugString()
<< ", op:" << op->DebugString(); << ", op:" << op->DebugString();
} }
return graph;
} }
} // namespace details } // namespace details
......
...@@ -24,8 +24,7 @@ namespace details { ...@@ -24,8 +24,7 @@ namespace details {
// TODO(gongwb): overlap allreduce with backward computation. // TODO(gongwb): overlap allreduce with backward computation.
class AllReduceDepsPass : public ir::Pass { class AllReduceDepsPass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph* graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace details } // namespace details
......
...@@ -11,12 +11,18 @@ ...@@ -11,12 +11,18 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <algorithm>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/operator.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "dgc/dgc.h"
#endif
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
// asynchronous nccl allreduce or synchronous issue: // asynchronous nccl allreduce or synchronous issue:
...@@ -34,11 +40,14 @@ namespace details { ...@@ -34,11 +40,14 @@ namespace details {
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs) const platform::NCCLContextMap *ctxs,
bool is_encoded, int nranks)
: OpHandleBase(node), : OpHandleBase(node),
local_scopes_(local_scopes), local_scopes_(local_scopes),
places_(places), places_(places),
nccl_ctxs_(ctxs) { nccl_ctxs_(ctxs),
is_encoded_(is_encoded),
nranks_(nranks) {
if (nccl_ctxs_) { if (nccl_ctxs_) {
for (auto &p : places_) { for (auto &p : places_) {
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
...@@ -52,10 +61,189 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -52,10 +61,189 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif #endif
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void AllReduceOpHandle::RunImplEncoded() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> ins;
std::vector<LoDTensor *> outs;
int k = -1;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &local_scope =
local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto original_name =
paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var);
auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in);
auto *out = local_scope->FindVar(out_var_handles[i]->name())
->GetMutable<LoDTensor>();
outs.emplace_back(out);
if (k < 0) {
k = GetKValue(in_var_handles[i]->name());
}
}
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t in_numel = 0;
size_t out_numel = 0;
PADDLE_ENFORCE(nranks_ > 1);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &place = places_[i];
auto &in = *ins[i];
void *in_tensor_buf = const_cast<void *>(in.data<void>());
auto &out = *outs[i];
float *out_tensor_buf = out.data<float>();
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
PADDLE_ENFORCE(in_numel % 2 == 0);
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = boost::get<platform::CUDAPlace>(place).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
auto &allocator =
platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
int encode_size = 2 * k * sizeof(int);
// dgc use ncclAllGather to get all the encoded data
// so the buffer need nranks.
int buf_size = nranks_ * encode_size;
auto tmp_ious_data = allocator.Allocate(buf_size);
void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
<< ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
<< ", k:" << k << ", place:" << place << ", dtype:" << dtype;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
stream));
});
}
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when manage NCCL by per thread per device
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
}
});
if (FLAGS_sync_nccl_allreduce) {
for (auto &p : places_) {
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
cudaError_t e_sync = cudaStreamSynchronize(stream);
if (e_sync != 0) {
LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
}
cudaError_t e_get = cudaGetLastError();
if (e_get != 0) {
LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get)
<< " errno:" << e_get;
}
}
}
}
int AllReduceOpHandle::GetKValue(const std::string &grad_name) {
auto original_name = paddle::framework::GradOriginalVarName(grad_name);
auto var_name = original_name + g_dgc_k;
PADDLE_ENFORCE(local_scopes_.size() > 0);
auto *scope = local_scopes_[0];
auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
auto tensor = var->Get<LoDTensor>().data<float>();
return *tensor;
}
#endif
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
bool AllReduceOpHandle::IsEncoded() {
if (!is_encoded_) {
return false;
}
auto counter_name = g_dgc_counter_name;
auto step_name = g_dgc_rampup_begin_step;
PADDLE_ENFORCE(local_scopes_.size() > 0);
auto *scope = local_scopes_[0];
auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto count_var = local_scope->FindVar(counter_name);
auto step_var = local_scope->FindVar(step_name);
if (count_var == nullptr || step_var == nullptr) {
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
step_var);
}
float count = *count_var->Get<LoDTensor>().data<float>();
float step = *step_var->Get<LoDTensor>().data<float>();
if (static_cast<int>(count) < static_cast<int>(step)) {
VLOG(10) << "in all_reduce currentstep:" << count
<< " < rampup_begin_step:" << step
<< " so not use sparse all reduce";
return false;
}
return true;
}
#else
bool AllReduceOpHandle::IsEncoded() { return false; }
#endif
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
if (!IsEncoded()) {
RunImplNormal();
return;
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
RunImplEncoded();
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
}
void AllReduceOpHandle::RunImplNormal() {
platform::RecordEvent record_event(Name()); platform::RecordEvent record_event(Name());
WaitInputVarGenerated(); WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -72,6 +260,8 @@ void AllReduceOpHandle::RunImpl() { ...@@ -72,6 +260,8 @@ void AllReduceOpHandle::RunImpl() {
auto &lod_tensor = auto &lod_tensor =
local_scope.FindVar(in_var_handles[i]->name())->Get<LoDTensor>(); local_scope.FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor); lod_tensors.emplace_back(&lod_tensor);
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal."); "The name of input and output should be equal.");
} }
...@@ -99,13 +289,17 @@ void AllReduceOpHandle::RunImpl() { ...@@ -99,13 +289,17 @@ void AllReduceOpHandle::RunImpl() {
auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream(); auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_; auto comm = nccl_ctx.comm_;
VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel
<< ", dev_id:" << dev_id << ", dtype:" << dtype
<< ", place:" << p;
all_reduce_calls.emplace_back([=] { all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum, buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream)); comm, stream));
}); });
} }
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) { if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when manage NCCL by per thread per device // Do not use NCCLGroup when manage NCCL by per thread per device
......
...@@ -28,11 +28,19 @@ namespace paddle { ...@@ -28,11 +28,19 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
constexpr char g_dgc_encoded[] = "__dgc_encoded__";
constexpr char g_dgc_k[] = "__dgc_k__";
#endif
struct AllReduceOpHandle : public OpHandleBase { struct AllReduceOpHandle : public OpHandleBase {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes, AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs); const platform::NCCLContextMap *ctxs,
bool is_encoded = false, int nranks = -1);
#else #else
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes, AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places); const std::vector<platform::Place> &places);
...@@ -50,8 +58,14 @@ struct AllReduceOpHandle : public OpHandleBase { ...@@ -50,8 +58,14 @@ struct AllReduceOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void RunImplEncoded();
const platform::NCCLContextMap *nccl_ctxs_; const platform::NCCLContextMap *nccl_ctxs_;
bool is_encoded_{false};
int nranks_{-1};
int GetKValue(const std::string &grad_name);
#endif #endif
void RunImplNormal();
bool IsEncoded();
}; };
} // namespace details } // namespace details
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB
"fuse_parameter_memory_size is up limited memory size "
"of one group parameters' gradient which is the input "
"of communication calling(e.g NCCLAllReduce). "
"The default value is 0, it means that "
"not set group according to memory_size.");
DEFINE_int32(
fuse_parameter_groups_size, 3,
"fuse_parameter_groups_size is the size of one group parameters' gradient. "
"The default value is a experimental result. If the "
"fuse_parameter_groups_size is 1, it means that the groups size is "
"the number of parameters' gradient. If the fuse_parameter_groups_size is "
"-1, it means that there are only one group. The default value is 3, it is "
"an experimental value.");
namespace paddle {
namespace framework {
namespace details {
static const char kUnKnow[] = "@UNKNOW@";
static framework::proto::VarType::Type kDefaultDtype =
framework::proto::VarType::Type::VarType_Type_BOOL;
class AllocContinuousSpaceForGradPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
ResetAttribute<ParamsAndGrads>(kParamsAndGrads, &result);
ResetAttribute<GroupGradsAndParams>(kGroupGradsAndParams, &result);
// NOTE: The operator nodes should be in topology order.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
for (auto &node : topo_nodes) {
RecordParamsAndGrads(node, &params_grads);
}
if (params_grads.size() == 0) {
VLOG(10) << "Doesn't find gradients";
return;
}
std::unordered_map<std::string, ir::Node *> vars;
for (ir::Node *node : result.Nodes()) {
if (node->IsVar() && node->Var()) {
// Note: The graph may have the same name node. For example, parameter
// is the input of operator and it also is the output of optimizer;
vars.emplace(node->Var()->Name(), node);
}
}
auto &group_grads_params =
result.Get<GroupGradsAndParams>(kGroupGradsAndParams);
// Note: the order of params_grads may be changed by SetGroupGradsAndParams.
SetGroupGradsAndParams(vars, params_grads, &group_grads_params);
params_grads.clear();
for (auto &group_p_g : group_grads_params) {
params_grads.insert(params_grads.begin(), group_p_g.begin(),
group_p_g.end());
}
for (auto &p_g : params_grads) {
std::swap(p_g.first, p_g.second);
}
// Set Gradients as Persistable to prevent this var becoming reusable.
auto dtype = kDefaultDtype;
for (auto &p_g : params_grads) {
// Get gradient var
auto iter = vars.find(p_g.second);
PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second);
iter->second->Var()->SetPersistable(true);
PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
// Get Dtype
auto ele_dtype = iter->second->Var()->GetDataType();
if (dtype == kDefaultDtype) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
"The data type should not be bool.");
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype,
"The data type of input is not consistent.");
}
// Create a FusedVarsSet to avoid duplicating names for fused_var in other
// pass.
if (!result.Has(kFusedVars)) {
result.Set(kFusedVars, new FusedVars);
}
// the kFusedGrads is used be fuse_optimizer_op_pass.
result.Set(kFusedGrads, new FusedGrads);
// the fused_var_name should be unique, so it appends
// params_grads.begin()->second.
auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" +
params_grads.begin()->second;
result.Get<FusedGrads>(kFusedGrads) = fused_var_name;
auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
"%s is duplicate in FusedVars.", fused_var_name);
fused_var_set.insert(fused_var_name);
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
fused_var_name, params_grads);
}
template <typename AttrType>
void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const {
if (graph->Has(attr_name)) {
VLOG(10) << attr_name << " is reset.";
graph->Erase(attr_name);
}
graph->Set(attr_name, new AttrType);
}
void SetGroupGradsAndParams(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const ParamsAndGrads &params_grads,
GroupGradsAndParams *group_grads_params) const {
SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params);
SetGroupAccordingToMemorySize(var_nodes, group_grads_params);
SetGroupAccordingToGroupSize(var_nodes, group_grads_params);
}
void SetGroupAccordingToLayers(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const ParamsAndGrads &params_grads,
GroupGradsAndParams *group_grads_params) const {
std::unordered_map<std::string, std::vector<int>> layer_params;
for (size_t i = 0; i < params_grads.size(); ++i) {
auto pos = params_grads[i].first.find_first_of(".");
if (pos == std::string::npos) {
layer_params[std::string(kUnKnow)].emplace_back(i);
} else {
layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i);
}
}
group_grads_params->reserve(layer_params.size());
for (size_t i = 0; i < params_grads.size(); ++i) {
auto pos = params_grads[i].first.find_first_of(".");
std::string key = kUnKnow;
if (pos != std::string::npos) {
key = params_grads[i].first.substr(0, pos);
}
auto iter = layer_params.find(key);
if (iter == layer_params.end()) continue;
group_grads_params->emplace_back();
auto &local_group_grads_params = group_grads_params->back();
for (auto &idx : iter->second) {
local_group_grads_params.emplace_back(
std::make_pair(params_grads[idx].second, params_grads[idx].first));
}
layer_params.erase(iter);
}
VLOG(10) << "SetGroupAccordingToLayers: ";
for (size_t i = 0; i < group_grads_params->size(); ++i) {
VLOG(10) << "group " << i;
std::stringstream out;
for (auto &p_g : group_grads_params->at(i)) {
out << "(" << p_g.second << ", " << p_g.first << "), ";
}
VLOG(10) << out.str();
}
}
void SetGroupAccordingToMemorySize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
GroupGradsAndParams *group_grads_params) const {
if (FLAGS_fuse_parameter_memory_size == 0) {
return;
}
size_t group_memory_size =
static_cast<size_t>(FLAGS_fuse_parameter_memory_size);
GroupGradsAndParams local_group_grads_params;
size_t j = 0;
while (j < group_grads_params->size()) {
local_group_grads_params.emplace_back();
auto &group_p_g = local_group_grads_params.back();
size_t local_group_memory_size = 0;
while (j < group_grads_params->size()) {
std::for_each(
group_grads_params->at(j).begin(), group_grads_params->at(j).end(),
[&local_group_memory_size,
&var_nodes](const std::pair<std::string, std::string> &g_p) {
auto iter = var_nodes.find(g_p.second);
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.",
g_p.second);
auto shape = iter->second->Var()->GetShape();
size_t size =
framework::SizeOfType(iter->second->Var()->GetDataType());
std::for_each(shape.begin(), shape.end(),
[&size](const int64_t &n) { size *= n; });
local_group_memory_size += size;
});
group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
group_grads_params->at(j).end());
++j;
if (local_group_memory_size >= group_memory_size) {
break;
}
}
}
std::swap(*group_grads_params, local_group_grads_params);
VLOG(10) << string::Sprintf(
"SetGroupAccordingToMemorySize(memory_size: %d):",
FLAGS_fuse_parameter_memory_size);
for (size_t i = 0; i < group_grads_params->size(); ++i) {
VLOG(10) << "group " << i;
std::stringstream out;
for (auto &g_p : group_grads_params->at(i)) {
auto iter = var_nodes.find(g_p.second);
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second);
auto shape = iter->second->Var()->GetShape();
size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
std::for_each(shape.begin(), shape.end(),
[&size](const int64_t &n) { size *= n; });
out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first);
}
VLOG(10) << out.str();
}
}
void SetGroupAccordingToGroupSize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
GroupGradsAndParams *group_grads_params) const {
if (FLAGS_fuse_parameter_groups_size == 1) {
return;
}
size_t group_size = static_cast<size_t>(FLAGS_fuse_parameter_groups_size);
if (FLAGS_fuse_parameter_groups_size == -1) {
group_size = group_grads_params->size();
}
PADDLE_ENFORCE_GT(group_size, 1);
size_t groups = (group_grads_params->size() + group_size - 1) / group_size;
GroupGradsAndParams local_group_grads_params;
local_group_grads_params.reserve(groups);
size_t j = 0;
for (size_t i = 0; i < groups; ++i) {
local_group_grads_params.emplace_back();
auto &group_p_g = local_group_grads_params.back();
group_p_g.reserve(group_size);
while (j < group_grads_params->size()) {
group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
group_grads_params->at(j).end());
++j;
if (j % group_size == 0) break;
}
}
std::swap(*group_grads_params, local_group_grads_params);
VLOG(10) << "SetGroupAccordingToGroupSize(group_size: " << group_size
<< "): ";
for (size_t i = 0; i < group_grads_params->size(); ++i) {
VLOG(10) << "group " << i;
std::stringstream out;
for (auto &p_g : group_grads_params->at(i)) {
out << "(" << p_g.second << ", " << p_g.first << "), ";
}
VLOG(10) << out.str();
}
}
private:
bool IsSupportedVarType(const proto::VarType::Type &type) const {
// Current only support LOD_TENSOR.
return type == proto::VarType::LOD_TENSOR;
}
void RecordParamsAndGrads(ir::Node *node,
ParamsAndGrads *params_grads) const {
try {
bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) return;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
auto backward_vars =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
for (size_t i = 0; i < backward_vars.size(); i += 2) {
VLOG(10) << "Trainable parameter: " << backward_vars[i]
<< ", gradient: " << backward_vars[i + 1];
params_grads->emplace_back(std::make_pair(
backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
}
} catch (boost::bad_get e) {
}
}
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::unordered_map<std::string, ir::Node *> &vars,
const std::string &fused_var_name,
const ParamsAndGrads &params_grads) const {
// Init Gradients and FusedVars
VLOG(10) << "Init FusedVars and Gradients.";
for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
auto &scope = *it;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has existed in scope.", fused_var_name);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
for (auto &p_g : params_grads) {
auto iter = vars.find(p_g.second);
PADDLE_ENFORCE(iter != vars.end());
PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
proto::VarType::LOD_TENSOR);
scope->Var(p_g.second)->GetMutable<LoDTensor>();
}
}
// Alloc continuous space for vars.
std::vector<std::string> grads_name;
std::vector<std::string> params_name;
grads_name.reserve(params_grads.size());
params_name.reserve(params_grads.size());
for (auto &p_g : params_grads) {
params_name.emplace_back(p_g.first);
grads_name.emplace_back(p_g.second);
}
framework::ProgramDesc program_desc;
AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
program_desc.MutableBlock(0));
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]);
}
}
}
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
op_desc->SetOutput("FusedOutput", {fused_var_name});
}
};
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(alloc_continuous_space_for_grad_pass,
paddle::framework::details::AllocContinuousSpaceForGradPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
...@@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() { ...@@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() {
if (places_.size() == 1) return; if (places_.size() == 1) return;
// The input and output may have dummy vars. // The input and output may have dummy vars.
VarHandle *in_var_handle; auto in_var_handles = DynamicCast<VarHandle>(inputs_);
{
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
in_var_handle = in_var_handles[0];
}
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(), out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places."); "The number of output should equal to the number of places.");
VarHandle *in_var_handle = in_var_handles[0];
WaitInputVarGenerated(); WaitInputVarGenerated();
std::vector<const Scope *> var_scopes; std::vector<const Scope *> var_scopes;
......
...@@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase { ...@@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase {
std::string Name() const override; std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; }; bool IsMultiDeviceTransfer() override { return true; };
protected: protected:
void RunImpl() override; void RunImpl() override;
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <memory> #include <memory>
#include <utility>
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
...@@ -45,12 +45,27 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -45,12 +45,27 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
public: public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) { : ir::PassBuilder(), strategy_(strategy) {
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
if (strategy_.enable_sequential_execution_) { if (strategy_.enable_sequential_execution_) {
VLOG(10) << "Add sequential_execution_pass";
AppendPass("sequential_execution_pass"); AppendPass("sequential_execution_pass");
} }
// Add op fusion.
if (strategy.sync_batch_norm_) {
AppendPass("sync_batch_norm_pass");
}
// Add op fusion. // Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) { if (strategy.fuse_relu_depthwise_conv_) {
VLOG(10) << "Add fuse_relu_depthwise_conv_pass";
AppendPass("fuse_relu_depthwise_conv_pass"); AppendPass("fuse_relu_depthwise_conv_pass");
} }
...@@ -62,29 +77,50 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -62,29 +77,50 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Add automatically inplace. // Add automatically inplace.
if (strategy_.enable_inplace_) { if (strategy_.enable_inplace_) {
VLOG(10) << "Add inplace_pass";
AppendPass("inplace_pass"); AppendPass("inplace_pass");
} }
if (strategy_.fuse_elewise_add_act_ops_) {
VLOG(10) << "Add fuse_elewise_add_act_pass";
AppendPass("fuse_elewise_add_act_pass");
}
// for single card training, fuse_all_reduce_ops is unnecessary.
// alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
if (strategy_.fuse_all_reduce_ops_) {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
}
if (strategy_.fuse_all_optimizer_ops_) {
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
strategy_.is_distribution_) {
VLOG(3)
<< "Currently, fuse_all_optimizer_ops only works under AllReduce "
"mode.";
strategy_.fuse_all_optimizer_ops_ = false;
} else {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
// NOTE: fuse_all_xx_ops will count the number of xx operator first,
// if the number is zero, fuse_all_reduce_ops will do nothing.
// Currently, only one type of optimization algorithm can be fused.
VLOG(10) << "Add fuse_adam_op_pass";
AppendPass("fuse_adam_op_pass");
VLOG(10) << "Add fuse_sgd_op_pass";
AppendPass("fuse_sgd_op_pass");
}
}
// Add a graph viz pass to record a graph. // Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) { if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass"); auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf( const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); "%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path)); viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
} }
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path",
new std::string(graph_path));
}
}
CollectiveContext *context = CollectiveContext::GetInstance(); CollectiveContext *context = CollectiveContext::GetInstance();
context->endpoints_ = strategy_.trainers_endpoints_; context->endpoints_ = strategy_.trainers_endpoints_;
context->trainer_id_ = strategy_.trainer_id_; context->trainer_id_ = strategy_.trainer_id_;
...@@ -101,11 +137,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -101,11 +137,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// the de-fact IR, any reuse on Graph is meaningless. // the de-fact IR, any reuse on Graph is meaningless.
// A side-effect of that, memory optimize cannot forsee the fetched vars // A side-effect of that, memory optimize cannot forsee the fetched vars
// , so fetchlist should be set persistable before call the Run interface. // , so fetchlist should be set persistable before call the Run interface.
if (strategy.memory_optimize_) { if (strategy_.memory_optimize_) {
auto memory_optimize_pass = AppendPass("memory_optimize_pass"); VLOG(10) << "Add memory_optimize_pass";
AppendPass("memory_optimize_pass");
} }
AppendMultiDevPass(strategy); AppendMultiDevPass(strategy_);
if (strategy_.fuse_all_reduce_ops_) {
// NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
// first, if the number is zero, fuse_all_reduce_ops will do nothing.
VLOG(10) << "Add fuse_all_reduce_op_pass";
AppendPass("fuse_all_reduce_op_pass");
}
// Add a graph print pass to record a graph with device info. // Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) { if (!strategy_.debug_graphviz_path_.empty()) {
...@@ -122,28 +166,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -122,28 +166,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Verify that the graph is correct for multi-device executor. // Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass"); AppendPass("multi_devices_check_pass");
if (SeqOnlyAllReduceOps(strategy)) { if (VLOG_IS_ON(2)) {
AppendPass("all_reduce_deps_pass");
}
if (SeqOnlyAllReduceOps(strategy_)) {
VLOG(10) << "Add all_reduce_deps_pass";
AppendPass("all_reduce_deps_pass"); AppendPass("all_reduce_deps_pass");
} }
if (strategy_.remove_unnecessary_lock_) { if (strategy_.remove_unnecessary_lock_) {
VLOG(10) << "Add modify_op_lock_and_record_event_pass";
AppendPass("modify_op_lock_and_record_event_pass"); AppendPass("modify_op_lock_and_record_event_pass");
} }
} }
// Convert graph to run on multi-devices. // Convert graph to run on multi-devices.
void AppendMultiDevPass(const BuildStrategy &strategy) { void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass; ir::Pass *multi_devices_pass = nullptr;
if (strategy_.is_distribution_) { if (strategy.is_distribution_) {
VLOG(3) << "multi device parameter server mode"; VLOG(10) << "Add dist_multi_devices_pass";
multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else { } else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
VLOG(3) << "multi devices collective mode with allreduce"; VLOG(10) << "Add all_reduce_mode_multi_devices_pass";
multi_devices_pass = multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get(); AppendPass("all_reduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
VLOG(3) << "multi deivces collective mode with reduce"; VLOG(10) << "Add reduce_mode_multi_devices_pass";
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else { } else {
PADDLE_THROW("Unknown reduce strategy."); PADDLE_THROW("Unknown reduce strategy.");
...@@ -173,15 +223,16 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { ...@@ -173,15 +223,16 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0; return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0;
} }
std::unique_ptr<ir::Graph> BuildStrategy::Apply( ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
std::unique_ptr<ir::Graph> graph, const std::vector<platform::Place> &places,
const std::vector<platform::Place> &places, const std::string &loss_var_name,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const size_t &nranks, const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const {
#else #else
const bool use_cuda) const { const bool use_cuda) const {
#endif #endif
// Create a default one if not finalized by user. // Create a default one if not finalized by user.
CreatePassesFromStrategy(false); CreatePassesFromStrategy(false);
...@@ -200,9 +251,31 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -200,9 +251,31 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs"); pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx); pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif #endif
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
pass->Type() == "fuse_adam_op_pass" ||
pass->Type() == "fuse_sgd_op_pass" ||
pass->Type() == "fuse_all_reduce_op_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
if (pass->Type() == "fuse_all_reduce_op_pass") {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
}
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
} else if (pass->Type() == "sequential_execution_pass") { } else if (pass->Type() == "sequential_execution_pass") {
LOG(INFO) << "set enable_sequential_execution:" LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_; << enable_sequential_execution_;
...@@ -217,7 +290,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -217,7 +290,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
} }
} }
VLOG(3) << "Start Apply Pass " << pass->Type(); VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph)); graph = pass->Apply(graph);
VLOG(3) << "Finish Apply Pass " << pass->Type(); VLOG(3) << "Finish Apply Pass " << pass->Type();
} }
return graph; return graph;
...@@ -227,12 +300,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -227,12 +300,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass); USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass); USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass); USE_PASS(multi_batch_merge_pass);
USE_PASS(reduce_mode_multi_devices_pass); USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(allreduce_mode_multi_devices_pass); USE_PASS(all_reduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass); USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass); USE_PASS(multi_devices_print_pass);
...@@ -242,4 +316,8 @@ USE_PASS(all_reduce_deps_pass); ...@@ -242,4 +316,8 @@ USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(inplace_pass); USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass); USE_PASS(lock_free_optimize_pass);
USE_PASS(alloc_continuous_space_for_grad_pass);
USE_PASS(graph_to_program_pass); USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_all_reduce_op_pass);
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -75,8 +75,14 @@ struct BuildStrategy { ...@@ -75,8 +75,14 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false}; bool fuse_elewise_add_act_ops_{false};
bool fuse_all_optimizer_ops_{false};
bool fuse_all_reduce_ops_{false};
bool fuse_relu_depthwise_conv_{false}; bool fuse_relu_depthwise_conv_{false};
bool sync_batch_norm_{false};
bool memory_optimize_{true}; bool memory_optimize_{true};
// TODO(dzhwinter): // TODO(dzhwinter):
// make enable_inplace, memory_optimize_ // make enable_inplace, memory_optimize_
...@@ -115,16 +121,15 @@ struct BuildStrategy { ...@@ -115,16 +121,15 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be // Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph. // applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph, ir::Graph *Apply(ir::Graph *graph, const std::vector<platform::Place> &places,
const std::vector<platform::Place> &places, const std::string &loss_var_name,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const std::vector<Scope *> &local_scopes, const size_t &nranks,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const; platform::NCCLContextMap *nccl_ctxs) const;
#else #else
const bool use_cuda) const; const bool use_cuda) const;
#endif #endif
// If set true, ParallelExecutor would build the main_program into multiple // If set true, ParallelExecutor would build the main_program into multiple
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
namespace paddle {
namespace framework {
namespace details {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle::DataBalanceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
if (ctxs) {
for (auto &p : places_) {
this->SetDeviceContext(p, ctxs->DevCtx(p));
}
}
}
#else
DataBalanceOpHandle::DataBalanceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
std::string DataBalanceOpHandle::Name() const { return "data balance"; }
std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
const std::vector<int> &device_sizes) {
int device_num = device_sizes.size();
int total_size = 0;
int empty_num = 0;
std::vector<std::array<int, 2>> size_device_vec;
size_device_vec.reserve(device_num);
for (int i = 0; i < device_num; ++i) {
if (device_sizes[i] == 0) {
++empty_num;
}
total_size += device_sizes[i];
size_device_vec.push_back({{device_sizes[i], i}});
}
std::vector<std::array<int, 3>> res;
if (empty_num == 0) {
// No need to do data balance.
return res;
}
if (total_size < device_num) {
// No enough data.
PADDLE_THROW_EOF();
}
std::sort(size_device_vec.begin(), size_device_vec.end(),
[](const std::array<int, 2> &a, const std::array<int, 2> &b) {
return a[0] > b[0];
});
int expected_device_size = total_size / device_num;
int src_idx = 0;
for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
if (size_device_vec[src_idx][0] <= expected_device_size) {
++src_idx;
PADDLE_ENFORCE_LT(
src_idx, device_num - empty_num,
"In current srategy an empty tensor should not be copy source.");
}
size_device_vec[src_idx][0] -= expected_device_size;
size_device_vec[dst_idx][0] += expected_device_size;
res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
expected_device_size}});
}
return res;
}
void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1UL,
"Data balance can only be enabled when the number of "
"places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
int data_num = in_var_handles.size() / places_.size();
WaitInputVarGenerated();
std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
std::vector<int> device_sizes;
for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
int place_idx = i / data_num;
int data_idx = i % data_num;
auto *local_scope =
local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
auto *tensor = tensor_var->GetMutable<LoDTensor>();
lod_tensors[data_idx].push_back(tensor);
int ins_size =
tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
if (data_idx == 0) {
device_sizes.emplace_back(ins_size);
} else {
PADDLE_ENFORCE_EQ(
ins_size, device_sizes.at(place_idx),
"All data on the same device shall have the same batch size.");
}
}
const auto &balance_plan = GetBalancePlan(device_sizes);
for (const auto &trans : balance_plan) {
for (int data_idx = 0; data_idx < data_num; ++data_idx) {
LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
int trans_ins_size = trans[2];
LoD src_lod = src_tensor->lod();
int src_ins_size =
src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
int cut_point = src_ins_size - trans_ins_size;
if (!src_lod.empty()) {
for (auto &level : src_lod) {
cut_point = level[cut_point];
}
}
TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
dst_tensor->place(), dst_tensor);
src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
if (!src_lod.empty()) {
dst_tensor->set_lod(SliceInLevel(
src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
src_tensor->set_lod(
SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
}
}
}
}
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -22,14 +22,9 @@ ...@@ -22,14 +22,9 @@
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
...@@ -175,12 +170,10 @@ static OpToVarNameSetMap ShrinkGCVars( ...@@ -175,12 +170,10 @@ static OpToVarNameSetMap ShrinkGCVars(
class EagerDeletionPass : public ir::Pass { class EagerDeletionPass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph *graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
}; };
std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl( void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts = auto &ref_cnts =
Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount); Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
PADDLE_ENFORCE(ref_cnts.empty(), PADDLE_ENFORCE(ref_cnts.empty(),
...@@ -206,8 +199,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl( ...@@ -206,8 +199,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
} }
} }
op_vars_map = ShrinkGCVars(op_vars_map, vars, places, double memory_fraction = framework::GetEagerDeletionMemoryFraction();
FLAGS_memory_fraction_of_eager_deletion);
op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction);
for (auto &pair : op_vars_map) { for (auto &pair : op_vars_map) {
auto *op = pair.first; auto *op = pair.first;
...@@ -239,13 +233,12 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl( ...@@ -239,13 +233,12 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
eager_deletion_op->AddOutput(dummy_leaf); eager_deletion_op->AddOutput(dummy_leaf);
} }
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction;
<< FLAGS_memory_fraction_of_eager_deletion;
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
auto while_op_eager_deletion_pass = auto while_op_eager_deletion_pass =
ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
return while_op_eager_deletion_pass->Apply(std::move(graph)); while_op_eager_deletion_pass->Apply(graph);
} }
} // namespace details } // namespace details
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
namespace details {
class EarlyDeleteOpHandle : public OpHandleBase {
public:
EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
const platform::Place& place,
const std::vector<std::string>& names,
GarbageCollector* gc)
: OpHandleBase(node),
scope_(scope),
place_(place),
names_(names),
gc_(gc) {
#ifdef PADDLE_WITH_CUDA
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(place);
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
}
#endif
}
~EarlyDeleteOpHandle() {
#ifdef PADDLE_WITH_CUDA
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventDestroy(event_));
}
#endif
}
std::string Name() const override { return "early_delete"; }
protected:
void RunImpl() override {
std::vector<std::shared_ptr<memory::Allocation>> tensors;
auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
for (auto& var_name : names_) {
auto* var = local_scope->FindVar(var_name);
PADDLE_ENFORCE(var != nullptr,
string::Sprintf("Local Scope not has var %s", var_name));
if (var->IsType<LoDTensor>()) {
tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
tensors.emplace_back(var->GetMutable<SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>();
for (auto& tensor : *tensor_array) {
tensors.emplace_back(tensor.MoveMemoryHolder());
}
}
}
if (!tensors.empty()) {
ClearTensors(tensors);
}
}
private:
void ClearTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
if (platform::is_cpu_place(place_)) {
ClearCPUTensors(tensors);
} else {
ClearGPUTensors(tensors);
}
}
void ClearCPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
if (gc != nullptr) {
gc->Add(tensors);
}
}
void ClearGPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
#ifdef PADDLE_WITH_CUDA
auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
if (gc != nullptr) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream = gc->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(tensors, callback_func);
} else {
gc_->Add(tensors);
}
}
bool IsStreamGarabageCollector() const {
return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
#endif
}
const Scope* scope_;
const platform::Place place_;
std::vector<std::string> names_;
GarbageCollector* gc_;
#ifdef PADDLE_WITH_CUDA
platform::CUDADeviceContext* dev_ctx_;
cudaEvent_t event_;
#endif
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( ...@@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
local_scopes_(local_scopes), local_scopes_(local_scopes),
places_(places), places_(places),
graph_(graph), graph_(graph),
fetch_ctxs_(places),
pool_(strategy.num_threads_), pool_(strategy.num_threads_),
prepare_pool_(1), // add one more thread for generate op_deps // add one more thread for generate op_deps
fetch_ctxs_(places) { prepare_pool_(1) {
for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) { for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
int dep = static_cast<int>(op->NotReadyInputSize()); int dep = static_cast<int>(op->NotReadyInputSize());
op_deps_.emplace(op, dep); op_deps_.emplace(op, dep);
......
...@@ -14,7 +14,9 @@ ...@@ -14,7 +14,9 @@
#pragma once #pragma once
#include <ThreadPool.h> #include <ThreadPool.h>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/exception_holder.h"
...@@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
const ir::Graph &Graph() const override; const ir::Graph &Graph() const override;
private: private:
// Note(zcd): the ThreadPool should be placed last so that ThreadPool should
// be destroyed first.
ExecutionStrategy strategy_; ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
...@@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::unordered_map<OpHandleBase *, int> op_deps_; std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_; std::vector<OpHandleBase *> bootstrap_ops_;
::ThreadPool pool_;
::ThreadPool prepare_pool_;
platform::DeviceContextPool fetch_ctxs_; platform::DeviceContextPool fetch_ctxs_;
std::atomic<int> remaining_; std::atomic<int> remaining_;
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
::ThreadPool pool_;
::ThreadPool prepare_pool_;
void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, OpHandleBase *op,
const std::shared_ptr<BlockingQueue<size_t>> &complete_q); const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
void PrepareAtomicOpDeps(); void PrepareAtomicOpDeps();
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -12,40 +12,55 @@ ...@@ -12,40 +12,55 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fuse_vars_op_handle.h" #include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
#include <string>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
FetchBarrierOpHandle::FetchBarrierOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
// fetch_barrier op always run on place0, but output on all places.
: OpHandleBase(node),
op_(framework::OpRegistry::CreateOp(*node->Op())),
local_scopes_(local_scopes),
places_(places),
run_scope_(local_scopes[0]),
place_(places[0]) {
for (auto &p : places) {
this->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p));
}
}
void FuseVarsOpHandle::RunImpl() { bool FetchBarrierOpHandle::IsMultiDeviceTransfer() {
WaitInputVarGenerated(place_); // override IsMultiDeviceTransfer to return true
return true;
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); }
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto out_var_handle = out_var_handles[0]; void FetchBarrierOpHandle::RunImpl() {
auto out_var = scope->Var(out_var_handle->name()); WaitInputVarGenerated(place_);
auto out_tensor = out_var->GetMutable<LoDTensor>(); auto run_func = [this]() {
out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_); op_->Run(*run_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
};
int64_t s = 0; if (is_lock_and_record_event_free_) {
for (size_t i = 1; i < out_var_handles.size(); ++i) { run_func();
auto out_name = out_var_handles[i]->name(); } else {
auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>(); this->RunAndRecordEvent(run_func);
auto numel = this->inputs_numel_.at(out_name);
out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
s += numel;
} }
this->RunAndRecordEvent([] {});
} }
std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } bool FetchBarrierOpHandle::NeedWait(VarHandleBase *in_var) {
bool need_wait =
in_var && in_var->GeneratedOp() &&
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
return need_wait;
}
std::string FetchBarrierOpHandle::Name() const { return op_->Type(); }
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -14,13 +14,13 @@ ...@@ -14,13 +14,13 @@
#pragma once #pragma once
#include <map> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -28,38 +28,34 @@ namespace paddle { ...@@ -28,38 +28,34 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct FuseVarsOpHandle : public OpHandleBase { // **NOTE**: fetch_barrier op is special it outputs all recved variables on
// all places if there are multiple places, must init with
// multiple dev_ctxes_ !!!!
struct FetchBarrierOpHandle : public OpHandleBase {
public: public:
FuseVarsOpHandle(ir::Node *node, Scope *local_scope, FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const platform::Place &place, const std::vector<platform::Place> &places);
const std::unordered_map<std::string, int64_t> &inputs_numel,
const proto::VarType::Type var_type)
: OpHandleBase(node),
local_scope_(local_scope),
place_(place),
inputs_numel_(inputs_numel),
type_(var_type) {
total_numel_ = 0;
for (auto in_numel : inputs_numel) {
PADDLE_ENFORCE_GT(in_numel.second, 0);
total_numel_ += in_numel.second;
}
}
std::string Name() const override; bool IsMultiDeviceTransfer() override;
bool IsMultiDeviceTransfer() override { return false; }; std::string Name() const override;
protected: protected:
void RunImpl() override; void RunImpl() override;
bool NeedWait(VarHandleBase *in_var) override;
private: private:
Scope *local_scope_; std::unique_ptr<OperatorBase> op_;
const platform::Place place_; std::vector<Scope *> local_scopes_;
const std::unordered_map<std::string, int64_t> inputs_numel_; std::vector<platform::Place> places_;
const proto::VarType::Type type_; Scope *run_scope_;
int64_t total_numel_; platform::Place place_;
bool is_lock_and_record_event_free_{false};
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) { ...@@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
} }
} }
bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
std::string FetchOpHandle::Name() const { return "Fetch"; } std::string FetchOpHandle::Name() const { return "Fetch"; }
} // namespace details } // namespace details
......
...@@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase { ...@@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase {
std::string Name() const override; std::string Name() const override;
bool IsMultiDeviceTransfer() override;
protected: protected:
void RunImpl() override; void RunImpl() override;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_adam_op_pass.h"
#include <algorithm>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }
const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
}
void FuseAdamOpPass::FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);
FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"),
adam_ops, graph);
FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"),
adam_ops, graph);
}
void FuseAdamOpPass::FuseAdamOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0));
// Check attributions
// NOTE: If new attribution is added, the following code maybe need change.
int op_role = boost::get<int>(
adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
float beta1 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta1"));
float beta2 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta2"));
float epsilon = boost::get<float>(adam_ops[0]->Op()->GetAttr("epsilon"));
bool lazy_mode = boost::get<bool>(adam_ops[0]->Op()->GetAttr("lazy_mode"));
int64_t min_row_size_to_use_multithread = boost::get<int64_t>(
adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread"));
for (auto &adam_op : adam_ops) {
PADDLE_ENFORCE_EQ(beta1,
boost::get<float>(adam_op->Op()->GetAttr("beta1")));
PADDLE_ENFORCE_EQ(beta2,
boost::get<float>(adam_op->Op()->GetAttr("beta2")));
PADDLE_ENFORCE_EQ(epsilon,
boost::get<float>(adam_op->Op()->GetAttr("epsilon")));
PADDLE_ENFORCE_EQ(lazy_mode,
boost::get<bool>(adam_op->Op()->GetAttr("lazy_mode")));
PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread,
boost::get<int64_t>(adam_op->Op()->GetAttr(
"min_row_size_to_use_multithread")));
PADDLE_ENFORCE_EQ(op_role, boost::get<int>(adam_op->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())));
}
// NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
// node.
VLOG(10) << "Insert adam to graph ";
OpDesc adam_desc(adam_ops[0]->Op()->Block());
adam_desc.SetType("adam");
adam_desc.SetInput("Param", {fused_vars_name.at("Param")});
adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate"));
adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
adam_desc.SetAttr("beta1", beta1);
adam_desc.SetAttr("beta2", beta2);
adam_desc.SetAttr("epsilon", epsilon);
adam_desc.SetAttr("lazy_mode", lazy_mode);
adam_desc.SetAttr("min_row_size_to_use_multithread",
min_row_size_to_use_multithread);
adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto adam_node = graph->CreateOpNode(&adam_desc);
InserInputAndOutputForOptOps(adam_ops, adam_node);
}
void FuseAdamOpPass::FuseScaleOps(const std::vector<std::string> &beta_name,
const std::string &fused_var_name,
const std::vector<ir::Node *> &adam_ops,
ir::Graph *graph) const {
PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size());
const std::string scale_op_name = "scale";
// Get the scale_ops of dealing the adam's beta var.
std::vector<ir::Node *> scale_ops;
scale_ops.reserve(beta_name.size());
for (size_t i = 0; i < adam_ops.size(); ++i) {
auto &beta_1_pow_name = beta_name[i];
auto beta_pow_iter = std::find_if(
adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(),
[&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool {
return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name;
});
PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end());
auto beta_pow_node = *beta_pow_iter;
auto scale_op_iter = std::find_if(
beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(),
[&scale_op_name](ir::Node *op_node) -> bool {
return op_node->Op() && op_node->Op()->Type() == scale_op_name;
});
PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end());
scale_ops.emplace_back(*scale_op_iter);
}
PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size());
// Check attributions
// NOTE: If new attribution is added, the following code maybe need change.
int op_role = boost::get<int>(
scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
float scale = boost::get<float>(scale_ops[0]->Op()->GetAttr("scale"));
float bias = boost::get<float>(scale_ops[0]->Op()->GetAttr("bias"));
bool bias_after_scale =
boost::get<bool>(scale_ops[0]->Op()->GetAttr("bias_after_scale"));
for (auto &scale_op : scale_ops) {
PADDLE_ENFORCE_EQ(scale,
boost::get<float>(scale_op->Op()->GetAttr("scale")));
PADDLE_ENFORCE_EQ(bias, boost::get<float>(scale_op->Op()->GetAttr("bias")));
PADDLE_ENFORCE_EQ(
bias_after_scale,
boost::get<bool>(scale_op->Op()->GetAttr("bias_after_scale")));
PADDLE_ENFORCE_EQ(op_role, boost::get<int>(scale_op->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())));
}
// NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
// node.
VLOG(10) << "Insert fused scale to graph.";
OpDesc scale_desc(scale_ops[0]->Op()->Block());
scale_desc.SetType("scale");
scale_desc.SetInput("X", {fused_var_name});
scale_desc.SetOutput("Out", {fused_var_name});
scale_desc.SetAttr("scale", scale);
scale_desc.SetAttr("bias", bias);
scale_desc.SetAttr("bias_after_scale", bias_after_scale);
scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto scale_node = graph->CreateOpNode(&scale_desc);
for (auto scale_op : scale_ops) {
// set inputs
scale_node->inputs.insert(scale_node->inputs.begin(),
scale_op->inputs.begin(), scale_op->inputs.end());
for (auto &input : scale_op->inputs) {
std::replace(input->outputs.begin(), input->outputs.end(), scale_op,
scale_node);
}
// set outputs
scale_node->outputs.insert(scale_node->outputs.begin(),
scale_op->outputs.begin(),
scale_op->outputs.end());
for (auto &output : scale_op->outputs) {
std::replace(output->inputs.begin(), output->inputs.end(), scale_op,
scale_node);
}
}
// Delete scale_ops
for (auto &scale_op : scale_ops) {
graph->RemoveNode(scale_op);
}
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_adam_op_pass, paddle::framework::details::FuseAdamOpPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseAdamOpPass : public FuseOptimizerOpPass {
private:
virtual const std::string GetOpType() const;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
// Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow"
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;
void FuseAdamOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;
void FuseScaleOps(const std::vector<std::string> &aux_var_set,
const std::string &fused_var_name,
const std::vector<ir::Node *> &adam_ops,
ir::Graph *graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
class FuseAllReduceOpPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_ctxs = &Get<platform::NCCLContextMap>(kNCCLCtxs);
#endif
std::unordered_set<std::string> grads;
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
size_t num_of_all_reduce = params_grads.size();
grads.reserve(num_of_all_reduce);
for (auto p_g : params_grads) {
grads.insert(p_g.second);
}
size_t num_place = places.size();
std::unordered_map<std::string, ir::Node *> all_reduce_ops;
all_reduce_ops.reserve(grads.size());
for (auto &node : result.Nodes()) {
if (node->IsOp()) {
PADDLE_ENFORCE(node->IsWrappedBy<OpHandleBase>());
auto *all_reduce_op_handle =
dynamic_cast<AllReduceOpHandle *>(&node->Wrapper<OpHandleBase>());
if (all_reduce_op_handle) {
auto inputs = DynamicCast<VarHandle>(all_reduce_op_handle->Inputs());
PADDLE_ENFORCE_EQ(inputs.size(), num_place);
// The inputs' name should be the same.
auto &grad_name = inputs[0]->name();
for (size_t i = 1; i < inputs.size(); ++i) {
PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
"The input name should be the same.");
}
PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
all_reduce_ops.emplace(grad_name, node);
}
}
}
VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
if (all_reduce_ops.size() == 0) {
return;
}
PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
"The number of all_reduce OpHandle is not equal to the "
"number of grads. Maybe some gradients are sparse type, "
"it is not supported currently.");
VLOG(10) << "Insert fused_all_reduce";
auto &group_grads_params =
graph->Get<GroupGradsAndParams>(kGroupGradsAndParams);
for (auto &group_g_p : group_grads_params) {
size_t group_size = group_g_p.size();
PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
std::vector<ir::Node *> group_all_reduce_ops;
group_all_reduce_ops.reserve(group_size);
for (auto &g_p : group_g_p) {
group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first));
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, nccl_ctxs, &result);
#else
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, &result);
#endif
}
}
void InsertFusedAllReduce(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const size_t num_of_all_reduce,
const std::vector<ir::Node *> &all_reduce_ops,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs,
#endif
ir::Graph *result) const {
std::vector<VarHandleBase *> inputs;
std::vector<VarHandleBase *> outputs;
for (auto &op : all_reduce_ops) {
auto &op_handle = op->Wrapper<OpHandleBase>();
inputs.insert(inputs.end(), op_handle.Inputs().begin(),
op_handle.Inputs().end());
// Remove output
for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(),
[&op_handle](VarHandleBase *var_handle) {
var_handle->RemoveOutput(&op_handle, op_handle.Node());
});
outputs.insert(outputs.end(), op_handle.Outputs().begin(),
op_handle.Outputs().end());
// Remove Input
for_each(
op_handle.Outputs().begin(), op_handle.Outputs().end(),
[](VarHandleBase *var_handle) { var_handle->ClearGeneratedOp(); });
result->RemoveNode(op_handle.Node());
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, nccl_ctxs, result);
#else
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, result);
#endif
}
private:
void CreateFusedAllReduceOp(const std::vector<VarHandleBase *> &inputs,
const std::vector<VarHandleBase *> &outputs,
const size_t num_of_all_reduce,
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs,
#endif
ir::Graph *result) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *op_handle = new FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
local_scopes, places, num_of_all_reduce, nccl_ctxs);
#else
auto *op_handle = new FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
local_scopes, places, num_of_all_reduce);
#endif
for (auto in : inputs) {
op_handle->AddInput(in);
}
for (auto out : outputs) {
op_handle->AddOutput(out);
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (!nccl_ctxs) {
SetCommunicationContext(places, op_handle);
}
#else
SetCommunicationContext(places, op_handle);
#endif
}
void SetCommunicationContext(const std::vector<platform::Place> &places,
FusedAllReduceOpHandle *op_handle) const {
for (size_t i = 0; i < places.size(); ++i) {
op_handle->SetDeviceContext(
places[i], platform::DeviceContextPool::Instance().Get(places[i]));
}
}
};
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_all_reduce_op_pass,
paddle::framework::details::FuseAllReduceOpPass);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include <algorithm>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
const std::string fuse_op_type = GetOpType();
const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
// Step 1: Get the specified op and auxiliary variables.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
std::vector<ir::Node *> opt_ops;
for (auto &node : topo_nodes) {
GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops,
&aux_var_set);
}
VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size();
if (opt_ops.size() == 0) {
return;
}
if (result.Has(kFusedOptType)) {
VLOG(10)
<< "Currently only support fusing one type optimizer op. Has fused "
<< result.Get<FusedOptType>(kFusedOptType);
return;
} else {
result.Set(kFusedOptType, new FusedOptType);
}
result.Get<FusedOptType>(kFusedOptType) = fuse_op_type;
// Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
// initialized in scopes before execution.
if (!result.Has(kFusedVars)) {
result.Set(kFusedVars, new FusedVars);
}
std::unordered_map<std::string, std::string> fused_vars_name;
fused_vars_name.reserve(aux_var_names.size() + 1);
auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
const std::string prefix(kFusedVarNamePrefix);
// NOTE: the fused_var_name should be unique.
for (auto &var_name : aux_var_names) {
auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
aux_var_set[var_name][0];
VLOG(10) << fused_var_name;
fused_vars_name.emplace(var_name, fused_var_name);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
fused_var_set.insert(fused_var_name);
}
// Step 3: Get the fused Gradient's name
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
if (!result.Has(kFusedGrads)) {
PADDLE_THROW(
"The alloc_continuous_space_for_grad_pass should be called before this "
"pass.");
}
auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
auto &fused_vars = result.Get<FusedVars>(kFusedVars);
auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
fused_vars_name.emplace("Grad", fused_grad);
// Step 4: Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly.
PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(),
"The size of params_grads and aux_var_set are not equal.");
SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
// Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
aux_var_set, fused_vars_name);
// Step 6: Fuse optimizer Ops and Scale Ops
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
// Step 7: Remove optimizer Ops
for (auto &opt_op : opt_ops) {
graph->RemoveNode(opt_op);
}
}
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const {
VLOG(10) << "Init FusedVars.";
// Alloc parameters and auxiliary vars in the respective scope.
size_t idx = local_scopes.size();
for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
++iter, --idx) {
auto &scope = *iter;
for (auto &var_name : aux_var_names) {
auto fused_var_name = fused_vars_name.at(var_name);
VLOG(10) << "Init " << fused_var_name;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has exist in scope[%d]", fused_var_name, idx);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
}
}
ProgramDesc program_desc;
auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace(aux_var_set.at(var_name),
fused_vars_name.at(var_name), true,
global_block);
}
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : global_block->AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]);
}
}
}
void FuseOptimizerOpPass::SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
std::vector<ir::Node *> *ops) const {
PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast<size_t>(0));
auto &param_vec = aux_vars_set->at("Param");
std::vector<size_t> param_sort_idx;
param_sort_idx.reserve(param_vec.size());
for (auto &p_g : params_grads) {
auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first);
PADDLE_ENFORCE(iter != param_vec.end());
auto idx = std::distance(param_vec.begin(), iter);
param_sort_idx.emplace_back(idx);
}
for (auto &aux_vars : *aux_vars_set) {
std::vector<std::string> sorted_vars;
sorted_vars.reserve(aux_vars.second.size());
for (size_t i = 0; i < aux_vars.second.size(); ++i) {
sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i]));
}
std::swap(aux_vars.second, sorted_vars);
std::stringstream out;
for (auto &var_name : aux_vars.second) {
out << var_name << " ";
}
VLOG(10) << aux_vars.first << ": " << out.str();
}
std::vector<ir::Node *> sorted_ops;
sorted_ops.reserve(ops->size());
for (size_t i = 0; i < ops->size(); ++i) {
sorted_ops.emplace_back(ops->at(param_sort_idx[i]));
}
std::swap(*ops, sorted_ops);
}
void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
const std::string &op_type, const std::vector<std::string> &aux_vars_name,
ir::Node *node, std::vector<ir::Node *> *ops,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const {
if (node->Op()->Type() != op_type) return;
for (auto &var_n : aux_vars_name) {
auto arg_names = node->Op()->Input(var_n);
PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
(*aux_args_name)[var_n].emplace_back(arg_names[0]);
VLOG(10) << var_n << ", " << arg_names[0];
}
ops->emplace_back(node);
}
void FuseOptimizerOpPass::AppendAllocContinuousSpace(
const std::vector<std::string> &args, const std::string &out_arg,
bool copy_data, BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", args);
op_desc->SetOutput("Output", args);
op_desc->SetOutput("FusedOutput", {out_arg});
op_desc->SetAttr("copy_data", copy_data);
op_desc->SetAttr("check_name", true);
}
void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
const std::vector<ir::Node *> &opt_ops, ir::Node *opt_node) const {
std::unordered_set<ir::Node *> inputs;
std::unordered_set<ir::Node *> outputs;
for (auto opt_op : opt_ops) {
// set inputs
inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
for (auto &input : opt_op->inputs) {
replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node);
}
// set outputs
outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end());
for (auto &output : opt_op->outputs) {
replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node);
}
}
opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(),
inputs.end());
opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(),
outputs.end());
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseOptimizerOpPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override;
protected:
virtual void SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_var_set,
std::vector<ir::Node *> *ops) const;
void InserInputAndOutputForOptOps(const std::vector<ir::Node *> &opt_ops,
ir::Node *opt_node) const;
private:
virtual const std::string GetOpType() const = 0;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const = 0;
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
void GetSpecifiedOpsAndVars(
const std::string &op_type, const std::vector<std::string> &aux_vars_name,
ir::Node *node, std::vector<ir::Node *> *ops,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const;
void AppendAllocContinuousSpace(const std::vector<std::string> &args,
const std::string &out_arg, bool copy_data,
BlockDesc *global_block) const;
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name)
const;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h"
#include <algorithm>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }
const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
return {"Param"};
}
void FuseSgdOpPass::FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph);
}
void FuseSgdOpPass::FuseSgdOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast<size_t>(0));
// NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
// node.
int op_role = boost::get<int>(
sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
VLOG(10) << "Insert sgd to graph ";
// Add fused scale
OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
Sgd_desc.SetType("sgd");
Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")});
Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate"));
// NOTE: multi_devices_pass requires that every op should have a role.
Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto sgd_node = graph->CreateOpNode(&Sgd_desc);
InserInputAndOutputForOptOps(sgd_ops, sgd_node);
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::details::FuseSgdOpPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseSgdOpPass : public FuseOptimizerOpPass {
private:
virtual const std::string GetOpType() const;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
// Fuse Sgd Ops
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
void FuseSgdOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(skip_fused_all_reduce_check, false, "");
namespace paddle {
namespace framework {
namespace details {
// Note(zcd): Addresses should be aligned, otherwise, the results may have
// diff.
static size_t Alignment(size_t size, const platform::Place &place) {
// Allow to allocate the minimum chunk size is 4 KB.
size_t alignment = 1 << 12;
if (platform::is_gpu_place(place)) {
// Allow to allocate the minimum chunk size is 256 B.
alignment = 1 << 8;
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
const platform::NCCLContextMap *ctxs)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
num_of_all_reduce_(num_of_all_reduce),
nccl_ctxs_(ctxs) {
if (nccl_ctxs_) {
for (auto &p : places_) {
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
}
}
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#else
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#endif
void FusedAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
VLOG(4) << this->DebugString();
WaitInputVarGenerated();
// The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
// The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), place_num * num_of_all_reduce_,
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
GradientAndLoDTensor grads_tensor;
grads_tensor.resize(place_num);
int64_t numel = -1;
auto dtype = static_cast<framework::proto::VarType::Type>(0);
for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
auto &g_tensor = grads_tensor.at(scope_idx);
g_tensor.reserve(num_of_all_reduce_);
GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor);
int64_t element_num = 0;
framework::proto::VarType::Type ele_dtype =
static_cast<framework::proto::VarType::Type>(0);
GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
if (numel == -1) {
numel = element_num;
}
if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype,
static_cast<framework::proto::VarType::Type>(0));
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
// Check whether the address space is contiguous.
std::sort(
g_tensor.begin(), g_tensor.end(),
[](const std::pair<std::string, const LoDTensor *> &grad1,
const std::pair<std::string, const LoDTensor *> &grad2) -> bool {
return grad1.second->data<void>() < grad2.second->data<void>();
});
size_t size_of_dtype = framework::SizeOfType(dtype);
for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data<void>();
int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = Alignment(len * size_of_dtype, places_[0]);
void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data<void>();
VLOG(10) << string::Sprintf(
"Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer "
"input[%d] address: 0X%02x. The offset: %d",
k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
next_address, k, infer_next_address, offset);
PADDLE_ENFORCE_EQ(infer_next_address, next_address,
"The address is not consistent.");
}
}
if (!FLAGS_skip_fused_all_reduce_check) {
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
for (size_t j = 1; j < num_of_all_reduce_; ++j) {
PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first);
}
}
}
std::vector<const void *> lod_tensor_data;
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
lod_tensor_data.emplace_back(data);
}
if (platform::is_gpu_place(places_[0])) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(nccl_dtype),
ncclSum, comm, stream));
});
}
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when manage NCCL by per thread per device
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
}
});
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {
// Special handle CPU only Operator's gradient. Like CRF
auto grad_name = grads_tensor.at(0).at(0).first;
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(grad_name)
->GetMutable<framework::LoDTensor>();
// Reduce All data to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(grad_name);
auto *dev_ctx = dev_ctxes_.at(p);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
}
}
}
void FusedAllReduceOpHandle::GetGradLoDTensor(
const size_t &scope_idx, const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles,
std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
auto *local_scope =
local_scopes_.at(scope_idx)->FindVar(kLocalExecScopeName)->Get<Scope *>();
size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
}
}
void FusedAllReduceOpHandle::GetDTypeAndNumel(
const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
proto::VarType::Type *dtype, int64_t *numel) const {
*numel = 0;
size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get dtype
auto ele_type = grad_tensor.at(i).second->type();
if (i == 0) {
*dtype = ele_type;
size_of_dtype = framework::SizeOfType(ele_type);
}
PADDLE_ENFORCE_EQ(ele_type, *dtype);
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
// Alignment(len)
*numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
}
std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
...@@ -27,31 +28,47 @@ namespace paddle { ...@@ -27,31 +28,47 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct DataBalanceOpHandle : public OpHandleBase { struct FusedAllReduceOpHandle : public OpHandleBase {
public:
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes, FusedAllReduceOpHandle(ir::Node *node,
const std::vector<platform::Place> &places, const std::vector<Scope *> &local_scopes,
const platform::NCCLContextMap *ctxs); const std::vector<platform::Place> &places,
const size_t num_of_all_reduce,
const platform::NCCLContextMap *ctxs);
#else #else
DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes, FusedAllReduceOpHandle(ir::Node *node,
const std::vector<platform::Place> &places); const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const size_t num_of_all_reduce);
#endif #endif
std::string Name() const override; std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; }; // Delay and buffer nccl_all_reduce together can significantly increase
// performance. Disable this feature by returning false.
bool IsMultiDeviceTransfer() override { return true; };
protected: protected:
void RunImpl() override; void RunImpl() override;
private: private:
// std::vector<(src_dev_id, dst_dev_id, trans_size)> std::vector<Scope *> local_scopes_;
std::vector<std::array<int, 3>> GetBalancePlan( std::vector<platform::Place> places_;
const std::vector<int> &batch_size_per_device); size_t num_of_all_reduce_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs_;
#endif
// Check the dtype of the input
void GetDTypeAndNumel(
const std::vector<std::pair<std::string, const LoDTensor *>> &g_tensor,
proto::VarType::Type *dtype, int64_t *total_num) const;
const std::vector<Scope *> local_scopes_; // Get gradient's name and LoDTensor
const std::vector<platform::Place> places_; void GetGradLoDTensor(const size_t &scope_idx,
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles,
std::vector<std::pair<std::string, const LoDTensor *>>
*grad_tensor) const;
}; };
} // namespace details } // namespace details
......
...@@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker { ...@@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker {
class DummyVarTypeInference : public VarTypeInference { class DummyVarTypeInference : public VarTypeInference {
public: public:
void operator()(const OpDesc& op_desc, BlockDesc* block) const override { void operator()(framework::InferVarTypeContext* ctx) const override {
auto& inputs = op_desc.Input("X"); auto& inputs = ctx->Input("X");
auto type = block->Var(inputs.front())->GetType(); auto type = ctx->GetType(inputs.front());
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = ctx->Output("Out").front();
block->Var(out_var_name)->SetType(type); ctx->SetType(out_var_name, type);
} }
}; };
......
...@@ -16,6 +16,9 @@ ...@@ -16,6 +16,9 @@
#include <algorithm> #include <algorithm>
#include <deque> #include <deque>
#include <iterator> #include <iterator>
#include <memory>
#include <queue>
#include <sstream>
#include <stack> #include <stack>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
...@@ -141,20 +144,18 @@ void InplacePass::InitSSAGraphNodes() const { ...@@ -141,20 +144,18 @@ void InplacePass::InitSSAGraphNodes() const {
} }
} }
std::unique_ptr<ir::Graph> InplacePass::ApplyImpl( void InplacePass::ApplyImpl(ir::Graph* graph) const {
std::unique_ptr<ir::Graph> graph) const {
var_nodes_.clear(); var_nodes_.clear();
view_.Build(graph.get()); view_.Build(graph);
InitSSAGraphNodes(); InitSSAGraphNodes();
auto cnt = 0;
for (auto* op : view_.AllOps()) { for (auto* op : view_.AllOps()) {
VLOG(4) << "Handle op " << cnt++ << ": " << op->Name();
if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name())) if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
continue; continue;
TryInplaceOpInputOutput(op, graph.get()); TryInplaceOpInputOutput(op, graph);
} }
graph->ResolveHazard(var_nodes_);
return graph;
} }
void InplacePass::InplaceModifyDesc(const std::string& var, void InplacePass::InplaceModifyDesc(const std::string& var,
...@@ -166,7 +167,7 @@ void InplacePass::InplaceModifyDesc(const std::string& var, ...@@ -166,7 +167,7 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
auto* op_desc = op->Op(); auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var); op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var); op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
op_desc->Flush(); op_desc->Flush();
} }
} }
...@@ -263,9 +264,8 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes, ...@@ -263,9 +264,8 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op, void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const { ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name(); VLOG(4) << "Try to inplace op " << op->Name();
PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
"op_desc is nullptr");
// some pre-requirments need to meet if the op want to inplaced. // some pre-requirments need to meet if the op want to inplaced.
PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
auto* op_desc = op->Op(); auto* op_desc = op->Op();
auto& infer_inplace = auto& infer_inplace =
...@@ -276,21 +276,58 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op, ...@@ -276,21 +276,58 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
PADDLE_ENFORCE(static_cast<bool>(infer_inplace), PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
"%s's infer_inplace has not been registered", op_desc->Type()); "%s's infer_inplace has not been registered", op_desc->Type());
auto* block = op_desc->Block(); auto in_to_outs = infer_inplace(*op_desc);
auto in_to_outs = infer_inplace(*op_desc, block);
auto& all_ops = view_.AllOps(); auto& all_ops = view_.AllOps();
auto cursor = std::find(all_ops.begin(), all_ops.end(), op); auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
size_t idx = std::distance(all_ops.begin(), cursor); size_t idx = std::distance(all_ops.begin(), cursor);
for (auto& pair : in_to_outs) { for (auto& pair : in_to_outs) {
auto& in_var_name = pair.first; auto& in_para_name = pair.first;
auto& out_var_name = pair.second; auto& out_para_name = pair.second;
auto input_vars = op->Op()->Input(in_para_name);
if (!input_vars.size()) {
VLOG(4) << "Parameter " << in_para_name << " is empty skip "
<< in_para_name << " => " << out_para_name << " pair";
continue;
}
auto output_vars = op->Op()->Output(out_para_name);
if (!output_vars.size()) {
VLOG(4) << "Parameter " << out_para_name << " is empty skip "
<< in_para_name << " => " << out_para_name << " pair";
continue;
}
auto in_var_name = input_vars.at(0);
auto out_var_name = output_vars.at(0);
auto* in_node = view_.GetNodeByName(in_var_name, op->inputs); auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
auto* out_node = view_.GetNodeByName(out_var_name, op->outputs); auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
bool can_replace = true;
if (in_var_name == out_var_name) {
can_replace = false;
VLOG(4) << "SKIP: Input variable " << in_var_name << " & Output variable "
<< out_var_name << " are the same";
} else if (!NodeCanReused(in_node)) {
can_replace = false;
VLOG(4) << "SKIP: Input varialbe " << in_var_name << "cannot be reused";
} else if (!NodeCanReused(out_node)) {
can_replace = false;
VLOG(4) << "SKIP: Output variable " << out_var_name
<< " cannot be reused";
} else if (details::NodeSize(*in_node->Var()) !=
details::NodeSize(*out_node->Var())) {
can_replace = false;
VLOG(4) << "SKIP: Input and Output varialbe size not match";
}
if (!can_replace) continue;
// 2. there is no external pending op on the input node // 2. there is no external pending op on the input node
if (view_.PendingOpsOnVar(in_node).size() > 1) { // if (view_.PendingOpsOnVar(in_node).size() > 1) {
if (in_node->outputs.size() > 1 && !view_.CheckDeps(in_node, op)) {
VLOG(4) << string::Sprintf( VLOG(4) << string::Sprintf(
"Skiped pair %s => %s. %s input has external dependency." "Skiped pair %s => %s. %s input has external dependency."
"inplace such pair will overwrite the memory.", "inplace such pair will overwrite the memory.",
...@@ -337,6 +374,98 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op, ...@@ -337,6 +374,98 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
} }
} }
void GraphView::TopoSort(ir::Graph* graph) {
//
ops_.clear();
auto deps_num = [](ir::Node* op) {
auto cnt = 0;
for (auto& var : op->inputs)
if (var->inputs.size() > 0) ++cnt;
return cnt;
};
std::queue<std::pair<ir::Node*, uint32_t>> ready_ops;
int level = 0;
auto nodes = graph->Nodes();
std::unordered_map<ir::Node*, uint32_t> deps_map;
for (auto& node : nodes) {
if (node->IsOp() && node->Op() != nullptr) {
deps_map[node] = deps_num(node);
if (0 == deps_map[node]) {
ready_ops.push({node, level});
}
}
}
while (!ready_ops.empty()) {
auto item = ready_ops.front();
ready_ops.pop();
ops_.emplace_back(item.first);
// record level when pop from queue
op_level_[item.first] = item.second;
for (auto node : item.first->outputs) {
for (auto op : node->outputs) {
--deps_map[op];
if (deps_map[op] == 0) ready_ops.push({op, item.second + 1});
}
}
}
bool all_ops_checked = true;
for (auto& node : nodes) {
if (node->IsOp() && node->Op() != nullptr && deps_map[node] > 0) {
all_ops_checked = false;
break;
}
}
PADDLE_ENFORCE(all_ops_checked, "All ops deps should be 0 after analysis");
}
// return true if current op node depeneds on all other op that use the same
// variable node
bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
// get op list that rely on the same variable
auto op_list = var->outputs;
for (auto& op : op_list) {
if (op == current_op) continue;
VLOG(4) << " GraphView::CheckDeps : " << op->Name() << " & "
<< current_op->Name();
if (!CheckOpDeps(op, current_op)) return false;
VLOG(4) << "";
}
return true;
}
// check if op2 depends on op1's output
bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
if (VLOG_IS_ON(4)) {
auto print_op = [&](ir::Node* op, const char* name) {
std::ostringstream os;
os << " " << name << " : " << op->Name() << " ";
os << "Input args : ";
for (auto& arg : op->inputs) os << arg->Name() << " ";
os << "Output args : ";
for (auto& arg : op->outputs) os << arg->Name() << " ";
os << "Level : " << op_level_.at(op);
VLOG(4) << os.str();
};
print_op(op1, "OP1");
print_op(op2, "OP2");
}
if (op1 == op2) return true;
if (op_level_.at(op1) >= op_level_.at(op2)) return false;
for (auto& var : op2->inputs)
if (var->inputs.size() > 0 && CheckOpDeps(op1, var->inputs[0])) return true;
return false;
}
ir::Node* GraphView::GetNodeByName(const std::string& name, ir::Node* GraphView::GetNodeByName(const std::string& name,
const std::vector<ir::Node*>& nodes) const { const std::vector<ir::Node*>& nodes) const {
// nodes should be op->inputs/outputs // nodes should be op->inputs/outputs
...@@ -382,22 +511,7 @@ void GraphView::Build(ir::Graph* g) { ...@@ -382,22 +511,7 @@ void GraphView::Build(ir::Graph* g) {
// Because we insert some new created node. Which may have data race between // Because we insert some new created node. Which may have data race between
// nodes. // nodes.
// resolve data harzards depends on the var nodes in right order. // resolve data harzards depends on the var nodes in right order.
ops_ = SortOpLikeDescOrder(*g); TopoSort(g);
// 1. track the nodes which reused previous node in Python memory optimize.
// these node can not be inplaced, otherwise may generate a circle in graph.
std::unordered_set<std::string> all_vars;
for (auto& node : g->Nodes()) {
if (node->IsVar()) continue;
for (auto& out : node->outputs) {
if (out->IsCtrlVar() || out->Var() == nullptr) continue;
if (all_vars.count(out->Name())) {
dup_nodes_.emplace(out->Name());
} else {
all_vars.emplace(out->Name());
}
}
}
// 2. track the nodes which used by parameter server. // 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer // these node can not be inplaced, otherwise trainer
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <map> #include <map>
#include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
...@@ -50,10 +51,15 @@ class GraphView { ...@@ -50,10 +51,15 @@ class GraphView {
// map the parameter and gradient, must be skipped. // map the parameter and gradient, must be skipped.
bool InSkipSet(const std::string& var) const; bool InSkipSet(const std::string& var) const;
bool CheckDeps(ir::Node* var, ir::Node* current_op) const;
bool CheckOpDeps(ir::Node* op1, ir::Node* op2) const;
void TopoSort(ir::Graph* g);
private: private:
std::vector<ir::Node*> ops_; std::vector<ir::Node*> ops_;
std::unordered_set<std::string> dup_nodes_; // mem opt affect nodes std::unordered_set<std::string> dup_nodes_; // mem opt affect nodes
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_; std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
std::unordered_map<ir::Node*, uint32_t> op_level_;
}; };
// swap pairs in sequence // swap pairs in sequence
...@@ -63,8 +69,7 @@ class InplacePass : public ir::Pass { ...@@ -63,8 +69,7 @@ class InplacePass : public ir::Pass {
InplacePass(); InplacePass();
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph* graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
void InitSSAGraphNodes() const; void InitSSAGraphNodes() const;
......
...@@ -190,7 +190,7 @@ struct NodeComparator { ...@@ -190,7 +190,7 @@ struct NodeComparator {
auto rhs_shape = rhs_desc->GetShape(); auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) { (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSize(lhs) <= NodeSize(rhs); return NodeSize(lhs) == NodeSize(rhs);
} else { } else {
return false; return false;
} }
...@@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) { ...@@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType(); auto type = node.GetType();
// only these types holds bulk of gpu memory // only these types holds bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR || if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) { type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false; return false;
} }
...@@ -450,6 +449,7 @@ void ControlFlowGraph::LiveVariableAnalysis() { ...@@ -450,6 +449,7 @@ void ControlFlowGraph::LiveVariableAnalysis() {
live_in_[op].insert(var); live_in_[op].insert(var);
} }
for (auto& var : defs_[op]) { for (auto& var : defs_[op]) {
if (uses_[op].count(var)) continue;
live_in_[op].erase(var); live_in_[op].erase(var);
} }
......
...@@ -143,14 +143,14 @@ TEST(OrderedSet, FindBestFitNode) { ...@@ -143,14 +143,14 @@ TEST(OrderedSet, FindBestFitNode) {
pool.Insert(node.get()); pool.Insert(node.get());
} }
// FindNextBestFitNode
auto* n = nodes[0].get(); auto* n = nodes[0].get();
auto* cache = pool.FindBestFitNode(n); auto* cache = pool.FindBestFitNode(n);
PADDLE_ENFORCE(cache->Name() == "a"); ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c");
cache = pool.FindNextBestFitNode(n, cache); auto* cache_b = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "c"); ASSERT_TRUE(cache_b->Name() != cache->Name());
cache = pool.FindNextBestFitNode(n, cache); ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c");
PADDLE_ENFORCE(cache->Name() == "b"); cache = pool.FindNextBestFitNode(n, cache_b);
ASSERT_TRUE(cache == nullptr);
} }
} // namespace details } // namespace details
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <type_traits> #include <type_traits>
#include <unordered_set>
#include <vector> #include <vector>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -43,8 +44,7 @@ namespace paddle { ...@@ -43,8 +44,7 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const {
std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes(); auto nodes = graph->Nodes();
CollectSkipVarsSet(nodes); CollectSkipVarsSet(nodes);
...@@ -112,7 +112,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -112,7 +112,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
cfg_->RenameVarInCFGGraph(var_name, cache_name, idx); cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
RenameVarInGraphDesc(var_name, cache_name, idx); RenameVarInGraphDesc(var_name, cache_name, idx);
RenameVarInGraphNode(var_name, cache_name, idx, graph.get()); RenameVarInGraphNode(var_name, cache_name, idx, graph);
pool_.Erase(cache_name); pool_.Erase(cache_name);
} }
} }
...@@ -127,8 +127,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -127,8 +127,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
} }
} }
graph->ResolveHazard(var_nodes_); graph->ResolveHazard(var_nodes_);
return graph;
} }
void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
...@@ -191,6 +189,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { ...@@ -191,6 +189,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// immediately to make the subblock variable reuse strategy take // immediately to make the subblock variable reuse strategy take
// effect. Because it is a single op in graph. No need to // effect. Because it is a single op in graph. No need to
// update the ir nodes. // update the ir nodes.
// FIXME(liuwei1031): Graph is not aware of the existence of
// BlockDescs and ProgramDescs.
// The operations related to BlockDesc or ProgramDesc should perform
// on Graph or Node directly!
sub_op_desc->Rename(var->Name(), cache->Name()); sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block() != nullptr && if (sub_op_desc->Block() != nullptr &&
sub_op_desc->Block()->HasVar(var->Name())) { sub_op_desc->Block()->HasVar(var->Name())) {
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -35,8 +36,7 @@ namespace details { ...@@ -35,8 +36,7 @@ namespace details {
class MemoryOptimizePass : public ir::Pass { class MemoryOptimizePass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph* graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
// fill the variable map(var_nodes) by version. // fill the variable map(var_nodes) by version.
void InitSSAGraphNodes() const; void InitSSAGraphNodes() const;
......
...@@ -34,8 +34,7 @@ static bool IsLockAndRecordEventFreeComputationOpHandle( ...@@ -34,8 +34,7 @@ static bool IsLockAndRecordEventFreeComputationOpHandle(
return true; return true;
} }
std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl( void ModifyOpLockAndRecordEventPass::ApplyImpl(ir::Graph *ir_graph) const {
std::unique_ptr<ir::Graph> ir_graph) const {
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*ir_graph); auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*ir_graph);
OpGraphView graph_view(all_ops); OpGraphView graph_view(all_ops);
for (auto &op : all_ops) { for (auto &op : all_ops) {
...@@ -49,7 +48,6 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl( ...@@ -49,7 +48,6 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
<< compute_op->DebugString(); << compute_op->DebugString();
} }
} }
return ir_graph;
} }
} // namespace details } // namespace details
......
...@@ -23,8 +23,7 @@ namespace details { ...@@ -23,8 +23,7 @@ namespace details {
class ModifyOpLockAndRecordEventPass : public ir::Pass { class ModifyOpLockAndRecordEventPass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph* graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace details } // namespace details
......
...@@ -23,10 +23,8 @@ namespace details { ...@@ -23,10 +23,8 @@ namespace details {
class SSAGraghBuilderWithChecker : public ir::Pass { class SSAGraghBuilderWithChecker : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph *graph) const override {
std::unique_ptr<ir::Graph> graph) const override { PADDLE_ENFORCE(IsValidGraph(graph));
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
} }
bool IsValidGraph(const ir::Graph *graph) const { bool IsValidGraph(const ir::Graph *graph) const {
......
...@@ -11,18 +11,20 @@ ...@@ -11,18 +11,20 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/data_balance_op_handle.h" #include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
...@@ -30,6 +32,7 @@ ...@@ -30,6 +32,7 @@
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -134,21 +137,25 @@ void AddOutputToLeafOps(ir::Graph *graph) { ...@@ -134,21 +137,25 @@ void AddOutputToLeafOps(ir::Graph *graph) {
} }
} // namespace } // namespace
void MultiDevSSAGraphBuilderBase::CheckGraph(const ir::Graph &graph) const {}
void MultiDevSSAGraphBuilderBase::Init() const { void MultiDevSSAGraphBuilderBase::Init() const {
all_vars_.clear(); all_vars_.clear();
loss_var_name_ = Get<const std::string>(kLossVarName); loss_var_name_ = Get<const std::string>(kLossVarName);
VLOG(10) << "Init MultiDevSSAGraphBuilder, loss name: " << loss_var_name_;
places_ = Get<const std::vector<platform::Place>>(kPlaces); places_ = Get<const std::vector<platform::Place>>(kPlaces);
local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes); local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
strategy_ = Get<const BuildStrategy>(kStrategy); strategy_ = Get<const BuildStrategy>(kStrategy);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs"); nccl_ctxs_ = &Get<platform::NCCLContextMap>(kNCCLCtxs);
#endif #endif
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
} }
std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const {
std::unique_ptr<ir::Graph> graph) const {
Init(); Init();
CheckGraph(*graph);
std::vector<ir::Node *> sorted_ops = SortOperations(*graph); std::vector<ir::Node *> sorted_ops = SortOperations(*graph);
auto nodes = graph->ReleaseNodes(); auto nodes = graph->ReleaseNodes();
...@@ -166,7 +173,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -166,7 +173,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
result.Set(kGraphOps, new GraphOps); result.Set(kGraphOps, new GraphOps);
bool is_forwarding = true; bool is_forwarding = true;
bool insert_collection_ops = NeedCollectiveOps();
for (ir::Node *node : sorted_ops) { for (ir::Node *node : sorted_ops) {
if (DealWithSpecialOp(&result, node)) { if (DealWithSpecialOp(&result, node)) {
...@@ -185,8 +191,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -185,8 +191,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
CreateComputationalOps(&result, node, places_.size()); CreateComputationalOps(&result, node, places_.size());
} }
// Insert collection ops // Insert collective ops if nranks > 1
if (!is_forwarding && insert_collection_ops) { if (!is_forwarding && Get<size_t>(kNRanks) > 1) {
try { try {
bool is_bk_op = bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr( static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
...@@ -200,13 +206,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -200,13 +206,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr( boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName())); OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
for (size_t i = 0; i < backward_vars.size(); i += 2) { for (size_t i = 0; i < backward_vars.size(); i += 2) {
auto &p_name = backward_vars[i]; auto &p_name = backward_vars[i];
auto &g_name = backward_vars[i + 1]; auto &g_name = backward_vars[i + 1];
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; VLOG(10) << "Bcast " << g_name << " for parameter " << p_name
<< " op_type " << node->Op()->Type();
InsertCollectiveOp(&result, p_name, g_name); if (NeedCollectiveForGrad(g_name, sorted_ops)) {
InsertCollectiveOp(&result, p_name, g_name);
}
} }
} catch (boost::bad_get e) { } catch (boost::bad_get e) {
} }
...@@ -226,8 +233,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -226,8 +233,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
* Only variables should be the leaves of graph. * Only variables should be the leaves of graph.
*/ */
AddOutputToLeafOps(&result); AddOutputToLeafOps(&result);
result.Erase(kGraphOps); result.Erase(kGraphOps);
return graph;
} }
void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
...@@ -258,6 +265,11 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( ...@@ -258,6 +265,11 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
} }
} }
bool MultiDevSSAGraphBuilderBase::DealWithSpecialOp(ir::Graph *result,
ir::Node *node) const {
return false;
}
std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations( std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
const ir::Graph &graph) const { const ir::Graph &graph) const {
return ir::TopologySortOperations(graph); return ir::TopologySortOperations(graph);
...@@ -271,8 +283,20 @@ bool MultiDevSSAGraphBuilderBase::UseGPU() const { ...@@ -271,8 +283,20 @@ bool MultiDevSSAGraphBuilderBase::UseGPU() const {
return use_gpu; return use_gpu;
} }
bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad(
return Get<size_t>(kNRanks) > 1; const std::string &grad_name, std::vector<ir::Node *> ops) const {
// if we have allreduce_op for current gradient variable in the graph,
// then we don't need to add allreduce_op_handle for this gradient
// NOTE: This is for the case that all gradients should add collective ops
for (auto *node : ops) {
if (node->Op()->Type() != "allreduce") continue;
for (auto in_name : node->Op()->InputArgumentNames()) {
if (in_name == grad_name) {
return false;
}
}
}
return true;
} }
void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
...@@ -390,8 +414,9 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, ...@@ -390,8 +414,9 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
CreateOpHandleIOs(result, node, dev_id); CreateOpHandleIOs(result, node, dev_id);
} }
void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
ir::Graph *result, const std::string &og) const { const std::string &og,
bool is_encoded) const {
OpHandleBase *op_handle = nullptr; OpHandleBase *op_handle = nullptr;
auto append_allreduce_op = [&]( auto append_allreduce_op = [&](
...@@ -400,7 +425,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( ...@@ -400,7 +425,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
scopes, places, nccl_ctxs_)); scopes, places, nccl_ctxs_, is_encoded,
static_cast<int>(strategy_.trainers_endpoints_.size()) *
places_.size()));
#else #else
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
...@@ -422,12 +449,15 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( ...@@ -422,12 +449,15 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
PADDLE_ENFORCE(!vars.empty()); PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back(); auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad); op_handle->AddInput(prev_grad);
VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString();
auto var = auto var =
new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
vars.size(), i, og, places_[i]); vars.size(), i, og, places_[i]);
vars.emplace_back(var); vars.emplace_back(var);
op_handle->AddOutput(var); op_handle->AddOutput(var);
VLOG(10) << "all_reduce_op_handle add output " << og
<< ", handle:" << var->DebugString();
} }
} }
...@@ -496,20 +526,17 @@ VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, ...@@ -496,20 +526,17 @@ VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result,
} }
bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const {
return boost::get<int>( return !loss_var_name_.empty() && node->Op() &&
boost::get<int>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
(static_cast<int>(OpRole::kBackward) | (static_cast<int>(OpRole::kBackward) |
static_cast<int>(OpRole::kLoss)) && static_cast<int>(OpRole::kLoss));
!loss_var_name_.empty(); // If loss_var is empty. This is test mode
} }
bool MultiDevSSAGraphBuilderBase::IsSparseGradient( bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
const std::string &og) const { const std::string &og) const {
PADDLE_ENFORCE(all_vars_.count(og) != 0); PADDLE_ENFORCE(all_vars_.count(og) != 0);
if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS;
return true;
}
return false;
} }
void AllReduceSSAGraphBuilder::InsertCollectiveOp( void AllReduceSSAGraphBuilder::InsertCollectiveOp(
...@@ -831,9 +858,17 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { ...@@ -831,9 +858,17 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
node->Op()->Type()); node->Op()->Type());
result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], // Create fetch_barrier op handle to enable output on all devices.
node->Op()->Type(), places_[op_dev_id])); // **NOTE** fetch_barrier should output variables list same as recv op does.
if (node->Op()->Type() == "fetch_barrier") {
result->Get<GraphOps>(kGraphOps).emplace_back(new FetchBarrierOpHandle(
result->CreateOpNode(node->Op()), local_scopes_, places_));
} else {
result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
node->Op()->Type(), places_[op_dev_id]));
}
if (node->Op()->Type() == "send") { if (node->Op()->Type() == "send") {
CreateOpHandleIOs(result, node, op_dev_id); CreateOpHandleIOs(result, node, op_dev_id);
...@@ -912,6 +947,17 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, ...@@ -912,6 +947,17 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
return op_dev_id; return op_dev_id;
} }
bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
auto u_name = p_name + "__dgc_u__";
auto it = all_vars_.find(u_name);
if (it == all_vars_.end()) {
VLOG(10) << "can't find u_name, so it's not encoded:" << u_name;
return false;
}
return true;
}
void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
const std::string &p_name, const std::string &p_name,
const std::string &g_name) const { const std::string &g_name) const {
...@@ -927,7 +973,11 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, ...@@ -927,7 +973,11 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
CreateReduceOp(result, g_name, 0); CreateReduceOp(result, g_name, 0);
CreateBroadcastOp(result, g_name, 0); CreateBroadcastOp(result, g_name, 0);
} else { } else {
CreateAllReduceOp(result, g_name); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
CreateAllReduceOp(result, g_name, IsEncoded(p_name));
#else
PADDLE_ENFORCE(false, "Compiled withoud cuda!");
#endif
} }
break; break;
default: default:
...@@ -995,7 +1045,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { ...@@ -995,7 +1045,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
paddle::framework::details::ReduceSSAGraphBuilder); paddle::framework::details::ReduceSSAGraphBuilder);
REGISTER_MULTI_DEVICES_PASS( REGISTER_MULTI_DEVICES_PASS(
allreduce_mode_multi_devices_pass, all_reduce_mode_multi_devices_pass,
paddle::framework::details::AllReduceSSAGraphBuilder); paddle::framework::details::AllReduceSSAGraphBuilder);
REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass,
paddle::framework::details::DistSSAGraphBuilder); paddle::framework::details::DistSSAGraphBuilder);
...@@ -14,10 +14,12 @@ ...@@ -14,10 +14,12 @@
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -32,30 +34,30 @@ class Scope; ...@@ -32,30 +34,30 @@ class Scope;
namespace details { namespace details {
constexpr char kLossVarName[] = "loss_var_name"; constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy"; constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks"; constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass { class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph *graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
virtual void Init() const; virtual void Init() const;
virtual void CheckGraph(const ir::Graph &graph) const;
virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const; virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const = 0; const std::string &g_name) const = 0;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const = 0; virtual void InsertPostprocessOps(ir::Graph *result) const = 0;
bool UseGPU() const; bool UseGPU() const;
bool NeedCollectiveOps() const; bool NeedCollectiveForGrad(const std::string &grad_name,
std::vector<ir::Node *> ops) const;
bool IsScaleLossOp(ir::Node *node) const; bool IsScaleLossOp(ir::Node *node) const;
...@@ -75,7 +77,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { ...@@ -75,7 +77,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
bool IsSparseGradient(const std::string &og) const; bool IsSparseGradient(const std::string &og) const;
void CreateAllReduceOp(ir::Graph *result, const std::string &og) const; void CreateAllReduceOp(ir::Graph *result, const std::string &og,
bool is_encoded = false) const;
void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
size_t src_dev_id) const; size_t src_dev_id) const;
...@@ -109,10 +112,6 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { ...@@ -109,10 +112,6 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const; const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
return false;
}
virtual void InsertPostprocessOps(ir::Graph *result) const {} virtual void InsertPostprocessOps(ir::Graph *result) const {}
}; };
...@@ -175,6 +174,8 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { ...@@ -175,6 +174,8 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_; mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
mutable bool need_broadcast_var_{false}; mutable bool need_broadcast_var_{false};
bool IsEncoded(const std::string &p_name) const;
}; };
std::unordered_set<std::string> &MultiDevSSAGraphBuilder(); std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
......
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <glog/logging.h> #include <glog/logging.h>
#include <fstream> #include <fstream>
#include <iosfwd> #include <iosfwd>
#include <memory>
#include <ostream> #include <ostream>
#include <string> #include <string>
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
...@@ -40,13 +41,11 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter { ...@@ -40,13 +41,11 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
class SSAGraghBuilderWithPrinter : public ir::Pass { class SSAGraghBuilderWithPrinter : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph* graph) const override {
std::unique_ptr<ir::Graph> graph) const override {
std::unique_ptr<std::ostream> fout( std::unique_ptr<std::ostream> fout(
new std::ofstream(Get<std::string>(kGraphvizPath))); new std::ofstream(Get<std::string>(kGraphvizPath)));
PADDLE_ENFORCE(fout->good()); PADDLE_ENFORCE(fout->good());
Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout); Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
return graph;
} }
}; };
......
...@@ -16,8 +16,10 @@ ...@@ -16,8 +16,10 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/var_handle.h"
...@@ -38,11 +40,32 @@ namespace details { ...@@ -38,11 +40,32 @@ namespace details {
// `std::vector<VarHandle*>` is the version of varaibles. // `std::vector<VarHandle*>` is the version of varaibles.
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>> typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars; GraphVars;
const char kGraphVars[] = "vars"; constexpr char kGraphVars[] = "vars";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
// aux variables to represent dependency. Useful to resolve data hazard. // aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase *> GraphDepVars; typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars"; constexpr char kGraphDepVars[] = "dep_vars";
typedef std::unordered_set<std::string> FusedVars;
constexpr char kFusedVars[] = "fused_vars";
constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
typedef std::string FusedOptType;
constexpr char kFusedOptType[] = "fused_opt_type";
typedef std::string FusedGrads;
constexpr char kFusedGrads[] = "fused_gradients";
typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
constexpr char kParamsAndGrads[] = "params_grads";
typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupGradsAndParams;
constexpr char kGroupGradsAndParams[] = "group_grads_params";
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include <map> #include <map>
#include <unordered_set>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() { ...@@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() {
void OpHandleBase::Run(bool use_cuda) { void OpHandleBase::Run(bool use_cuda) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (events_.empty() && use_cuda) { if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) {
for (auto &p : dev_ctxes_) { for (auto &p : dev_ctxes_) {
int dev_id = boost::get<platform::CUDAPlace>(p.first).device; int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE( PADDLE_ENFORCE(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
} }
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
int dev_id =
boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
} else {
PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
"%s should have only one dev_ctx.", Name());
auto &place = dev_ctxes_.begin()->first;
int dev_id = boost::get<platform::CUDAPlace>(place).device;
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
PADDLE_ENFORCE(
platform::is_same_place(place, out_var_handle->place()),
"The place of input(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name());
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
}
} }
#else #else
PADDLE_ENFORCE(!use_cuda); PADDLE_ENFORCE(!use_cuda);
#endif #endif
...@@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) { ...@@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
void OpHandleBase::WaitInputVarGenerated() { void OpHandleBase::WaitInputVarGenerated() {
for (auto in_var : inputs_) { for (auto in_var : inputs_) {
if (NeedWait(in_var)) { if (NeedWait(in_var)) {
for (auto &pair : dev_ctxes_) { // Dummy Variable is used to represent dependencies between operators, so
in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second); // there doesn't add event for it.
auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
if (in_var_handle) {
auto &place = in_var_handle->place();
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream();
PADDLE_ENFORCE(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
#endif
}
// There are nothing to do when the place is CPUPlace.
} }
} }
} }
} }
void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
for (auto *in : inputs_) { for (auto in_var : inputs_) {
if (NeedWait(in)) { if (NeedWait(in_var)) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place)); // Dummy Variable is used to represent dependencies between operators, so
// there doesn't add event for it.
auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
if (in_var_handle) {
if (platform::is_gpu_place(in_var_handle->place())) {
#ifdef PADDLE_WITH_CUDA
auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place()))
->stream();
PADDLE_ENFORCE(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
#endif
}
// There are nothing to do when the place is CPUPlace.
}
} }
} }
} }
......
...@@ -16,9 +16,12 @@ limitations under the License. */ ...@@ -16,9 +16,12 @@ limitations under the License. */
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
...@@ -34,27 +37,86 @@ enum OpInfoFillType { ...@@ -34,27 +37,86 @@ enum OpInfoFillType {
kGradOpDescMaker = 2, kGradOpDescMaker = 2,
kVarTypeInference = 3, kVarTypeInference = 3,
kShapeInference = 4, kShapeInference = 4,
kInplaceOpInference = 5 kInplaceOpInference = 5,
kNoNeedBufferVarsInference = 6,
kUnknown = -1
}; };
namespace internal {
template <typename T, OpInfoFillType kType>
struct TypePair {
using Type = T;
static constexpr OpInfoFillType kFillType = kType;
};
using OpRegistryClasses = std::tuple< // NOLINT
TypePair<OperatorBase, kOperator>, // NOLINT
TypePair<OpProtoAndCheckerMaker, kOpProtoAndCheckerMaker>, // NOLINT
TypePair<GradOpDescMakerBase, kGradOpDescMaker>, // NOLINT
TypePair<VarTypeInference, kVarTypeInference>, // NOLINT
TypePair<InferShapeBase, kShapeInference>, // NOLINT
TypePair<InplaceOpInference, kInplaceOpInference>, // NOLINT
TypePair<NoNeedBufferVarsInference, kNoNeedBufferVarsInference> // NOLINT
>;
static constexpr int kOpRegistryClassNumber =
std::tuple_size<OpRegistryClasses>::value;
template <typename T, int kPos, bool kIsBounded /* = true*/>
struct IsMatchedBaseTypeImpl {
using PairType = typename std::tuple_element<kPos, OpRegistryClasses>::type;
static constexpr bool kValue =
std::is_base_of<typename PairType::Type, T>::value;
};
template <typename T, int kPos>
struct IsMatchedBaseTypeImpl<T, kPos, false> {
static constexpr bool kValue = false;
};
template <typename T, int kPos>
static inline constexpr bool IsMatchedBaseType() {
return IsMatchedBaseTypeImpl<
T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue;
}
template <typename T, int kStart, int kEnd, bool kIsEnd, bool kIsMatched>
struct OpInfoFillTypeGetterImpl {};
// This case should not happen
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, true> {};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, false> {
static constexpr OpInfoFillType kType = kUnknown;
};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, false> {
static constexpr OpInfoFillType kType =
OpInfoFillTypeGetterImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd,
IsMatchedBaseType<T, kStart + 1>()>::kType;
};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, true> {
using PairType = typename std::tuple_element<kStart, OpRegistryClasses>::type;
static constexpr OpInfoFillType kType = PairType::kFillType;
};
template <typename T>
using OpInfoFillTypeGetter =
OpInfoFillTypeGetterImpl<T, 0, kOpRegistryClassNumber,
kOpRegistryClassNumber == 0,
IsMatchedBaseType<T, 0>()>;
} // namespace internal
template <typename T> template <typename T>
struct OpInfoFillTypeID { struct OpInfoFillTypeID {
static constexpr OpInfoFillType ID() { static constexpr OpInfoFillType ID() {
return std::is_base_of<OperatorBase, T>::value return internal::OpInfoFillTypeGetter<T>::kType;
? kOperator
: (std::is_base_of<OpProtoAndCheckerMaker, T>::value
? kOpProtoAndCheckerMaker
: (std::is_base_of<GradOpDescMakerBase, T>::value
? kGradOpDescMaker
: (std::is_base_of<VarTypeInference, T>::value
? kVarTypeInference
: (std::is_base_of<InferShapeBase, T>::value
? kShapeInference
: (std::is_base_of<
InplaceOpInference, T>::value
? kInplaceOpInference
: static_cast<OpInfoFillType>(
-1))))));
} }
}; };
...@@ -127,9 +189,9 @@ struct OpInfoFiller<T, kGradOpDescMaker> { ...@@ -127,9 +189,9 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
template <typename T> template <typename T>
struct OpInfoFiller<T, kVarTypeInference> { struct OpInfoFiller<T, kVarTypeInference> {
void operator()(const char* op_type, OpInfo* info) const { void operator()(const char* op_type, OpInfo* info) const {
info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { info->infer_var_type_ = [](InferVarTypeContext* context) {
T inference; T inference;
inference(fwd_op, block); inference(context);
}; };
} }
}; };
...@@ -147,9 +209,21 @@ struct OpInfoFiller<T, kShapeInference> { ...@@ -147,9 +209,21 @@ struct OpInfoFiller<T, kShapeInference> {
template <typename T> template <typename T>
struct OpInfoFiller<T, kInplaceOpInference> { struct OpInfoFiller<T, kInplaceOpInference> {
void operator()(const char* op_type, OpInfo* info) const { void operator()(const char* op_type, OpInfo* info) const {
info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) { info->infer_inplace_ = [](const OpDesc& op_desc) {
T infer; T infer;
return infer(op_desc, block); return infer(op_desc);
};
}
};
template <typename T>
struct OpInfoFiller<T, kNoNeedBufferVarsInference> {
void operator()(const char* op_type, OpInfo* info) const {
info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs) {
T infer(inputs, outputs, attrs);
return infer();
}; };
} }
}; };
......
...@@ -96,7 +96,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ...@@ -96,7 +96,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
auto seq_allreduce_pass = auto seq_allreduce_pass =
ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
for (size_t i = 0; i < graphs_.size(); ++i) { for (size_t i = 0; i < graphs_.size(); ++i) {
graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i])); graphs_[i].reset(seq_allreduce_pass->Apply(graphs_[i].release()));
} }
// set the correct size of thread pool to each device. // set the correct size of thread pool to each device.
......
...@@ -53,6 +53,31 @@ struct ReduceLoDTensor { ...@@ -53,6 +53,31 @@ struct ReduceLoDTensor {
} }
}; };
struct ReduceBufferData {
const std::vector<const void *> &src_data_;
void *dst_data_;
int64_t numel_;
ReduceBufferData(const std::vector<const void *> &src, void *dst,
int64_t numel)
: src_data_(src), dst_data_(dst), numel_(numel) {}
template <typename T>
void apply() const {
T *dst_data = reinterpret_cast<T *>(dst_data_);
for (size_t i = 0; i < src_data_.size(); ++i) {
auto srd_data = reinterpret_cast<const T *>(src_data_[i]);
VLOG(10) << "dst: " << dst_data_ << ", " << srd_data;
if (srd_data == dst_data_) {
continue;
}
std::transform(srd_data, srd_data + numel_, dst_data, dst_data,
[](T a, T b) -> T { return a + b; });
}
}
};
inline void GatherLocalSelectedRows( inline void GatherLocalSelectedRows(
const std::vector<const SelectedRows *> &src_selecte_rows_, const std::vector<const SelectedRows *> &src_selecte_rows_,
const std::vector<platform::Place> &in_places, const std::vector<platform::Place> &in_places,
......
...@@ -193,8 +193,80 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, ...@@ -193,8 +193,80 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
return shrink_func(computation_op); return shrink_func(computation_op);
} }
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( /**
std::unique_ptr<ir::Graph> graph) const { * Shrink op dependencies according to no need buffer vars.
*
* If some ops do not need Tensor buffer of any input,
* just remove the dependency of this op, i.e, decrease reference count.
*
* For example, input Y of elementwise_add_grad op is only used to infer shape
* and lod of Y@GRAD, we do not need the buffer of input Y. Data buffer of
* input Y can be collected before elementwise_add_grad op runs.
*
* This method returns whether the dependency count decreases to 0, and
* shrinks op dependency if possible.
*/
static bool ShrinkNoNeedBufferVarOpDependency(
const std::string &var_name,
std::unordered_set<ComputationOpHandle *> *op_handles) {
std::vector<ComputationOpHandle *> skip_ops;
for (auto *op_handle : *op_handles) {
auto *op_base = op_handle->GetOp();
auto &inferer = op_base->Info().NoNeedBufferVarsInferer();
if (!inferer) {
continue;
}
std::unordered_set<std::string> no_need_buffer_vars =
inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs());
// Check whether var_name occurs in other inputs or outputs of the op
// If it occurs, we cannot decrease the dependency number.
bool occurred_in_other_vars = false;
for (auto &in_pair : op_base->Inputs()) {
if (no_need_buffer_vars.count(in_pair.first) > 0) {
continue;
}
auto &args = in_pair.second;
auto iter = std::find(args.begin(), args.end(), var_name);
if (iter != args.end()) {
occurred_in_other_vars = true;
break;
}
}
if (occurred_in_other_vars) {
continue;
}
for (auto &out_pair : op_base->Outputs()) {
auto &args = out_pair.second;
auto iter = std::find(args.begin(), args.end(), var_name);
if (iter != args.end()) {
occurred_in_other_vars = true;
break;
}
}
if (!occurred_in_other_vars) {
VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name();
skip_ops.emplace_back(op_handle);
}
}
if (skip_ops.size() == op_handles->size()) {
op_handles->clear();
return true;
} else {
for (auto *skip_op : skip_ops) {
op_handles->erase(skip_op);
}
return false;
}
}
void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount); auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
auto &last_live_ops_of_vars = auto &last_live_ops_of_vars =
Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars); Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
...@@ -229,21 +301,46 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( ...@@ -229,21 +301,46 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
continue; continue;
} }
bool ok; auto &var_name = name_var_pair.first;
auto result = ExtractComputationOpFromLastLivedVar( auto &var_handles = name_var_pair.second;
name_var_pair.second.back(), i, shrink_func, &ok);
for (auto iter = var_handles.rbegin(); iter != var_handles.rend();
++iter) {
bool ok;
auto result =
ExtractComputationOpFromLastLivedVar(*iter, i, shrink_func, &ok);
// Seldomly, some vars may have no pending or preceding computation ops
// Just break;
if (!ok) break;
VLOG(10) << "Extract " << result.size() << " ops of var " << var_name;
size_t original_op_deps = result.size();
// If all ops do not need buffer of var_name, calculate reference count
// of the previous version of var_name.
if (ShrinkNoNeedBufferVarOpDependency(var_name, &result)) {
VLOG(10) << "Try to precede reference count computing at var "
<< var_name;
continue;
}
size_t final_op_deps = result.size();
if (final_op_deps < original_op_deps) {
VLOG(5) << "Shrink op deps from " << original_op_deps << " to "
<< final_op_deps;
}
if (ok) {
auto &var_name = name_var_pair.first;
PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
var_name); var_name);
ref_cnts[i].emplace(var_name, result.size()); ref_cnts[i].emplace(var_name, result.size());
last_live_ops_of_vars[i].emplace(var_name, std::move(result)); last_live_ops_of_vars[i].emplace(var_name, std::move(result));
break;
} }
// Seldomly, all preceding trying failed.
// Just skip this corner case
} }
} }
return graph;
} }
} // namespace details } // namespace details
......
...@@ -23,8 +23,7 @@ namespace details { ...@@ -23,8 +23,7 @@ namespace details {
class ReferenceCountPass : public ir::Pass { class ReferenceCountPass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph* graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace details } // namespace details
......
...@@ -29,8 +29,7 @@ static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) { ...@@ -29,8 +29,7 @@ static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
op1->Outputs() == op2->Outputs(); op1->Outputs() == op2->Outputs();
} }
std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl( void SequentialExecutionPass::ApplyImpl(ir::Graph *graph) const {
std::unique_ptr<ir::Graph> graph) const {
// FIXME(zjl): Insert dependencies between some distributed ops may cause // FIXME(zjl): Insert dependencies between some distributed ops may cause
// the multi_devices_graph_pass fails. So we skip these ops here. // the multi_devices_graph_pass fails. So we skip these ops here.
// Indeed, maybe we should not insert dependencies between these ops // Indeed, maybe we should not insert dependencies between these ops
...@@ -98,7 +97,6 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl( ...@@ -98,7 +97,6 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name() VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
<< " and " << op_node_list[i]->Name(); << " and " << op_node_list[i]->Name();
} }
return graph;
} }
} // namespace details } // namespace details
......
...@@ -23,8 +23,7 @@ namespace details { ...@@ -23,8 +23,7 @@ namespace details {
class SequentialExecutionPass : public ir::Pass { class SequentialExecutionPass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph* graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace details } // namespace details
......
...@@ -15,18 +15,20 @@ ...@@ -15,18 +15,20 @@
#pragma once #pragma once
#include <deque> #include <deque>
#include <functional>
#include <list> #include <list>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <functional>
#include "ThreadPool.h" // ThreadPool in thrird party #include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -36,6 +38,12 @@ class Scope; ...@@ -36,6 +38,12 @@ class Scope;
namespace details { namespace details {
struct OpDependentData {
std::unordered_map<OpHandleBase *, size_t> pending_ops_;
std::unordered_set<VarHandleBase *> pending_vars_;
std::unordered_set<OpHandleBase *> ready_ops_;
};
class ThreadedSSAGraphExecutor : public SSAGraphExecutor { class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
public: public:
ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
...@@ -55,33 +63,38 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -55,33 +63,38 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op); details::OpHandleBase *op);
private: private:
// Note(zcd): the ThreadPool should be placed last so that ThreadPool should
// be destroyed first.
ir::Graph *graph_; ir::Graph *graph_;
std::unique_ptr<::ThreadPool> pool_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_; platform::DeviceContextPool fetch_ctxs_;
ExceptionHolder exception_holder_; ExceptionHolder exception_holder_;
std::atomic<int> running_ops_; std::unique_ptr<OpDependentData> op_deps_;
std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
ExecutionStrategy strategy_;
// use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_;
::ThreadPool prepare_pool_;
std::unique_ptr<::ThreadPool> pool_;
void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops, void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
OpHandleBase *op_instance) const; OpHandleBase *op_instance) const;
void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars, void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
BlockingQueue<VarHandleBase *> *ready_vars, std::unordered_set<VarHandleBase *> *ready_vars,
VarHandleBase *var) const; VarHandleBase *var) const;
void InsertFetchOps(const std::vector<std::string> &fetch_tensors, void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
std::vector<FetchOpHandle *> *fetch_ops, std::vector<FetchOpHandle *> *fetch_ops,
std::unordered_set<VarHandleBase *> *fetch_dependencies, std::unordered_set<VarHandleBase *> *fetch_dependencies,
std::unordered_set<OpHandleBase *> *ready_ops,
std::unordered_map<OpHandleBase *, size_t> *pending_ops, std::unordered_map<OpHandleBase *, size_t> *pending_ops,
std::unordered_set<VarHandleBase *> *pending_vars, std::unordered_set<VarHandleBase *> *pending_vars,
BlockingQueue<VarHandleBase *> *ready_vars,
FeedFetchList *fetch_data); FeedFetchList *fetch_data);
private: void PrepareOpDeps();
ExecutionStrategy strategy_; void CopyOpDeps();
// use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_;
}; };
} // namespace details } // namespace details
......
...@@ -24,7 +24,8 @@ VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } ...@@ -24,7 +24,8 @@ VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); }
std::string VarHandle::DebugString() const { std::string VarHandle::DebugString() const {
std::stringstream ss; std::stringstream ss;
ss << name_ << ":" << place_; ss << "name:" << name_ << ", place:" << place_ << ", version:" << version_
<< ", scope_idx:" << scope_idx_;
return ss.str(); return ss.str();
} }
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -22,8 +22,7 @@ namespace ir { ...@@ -22,8 +22,7 @@ namespace ir {
class AttentionLSTMFusePass : public FusePassBase { class AttentionLSTMFusePass : public FusePassBase {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( void ApplyImpl(ir::Graph* graph) const override;
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册