提交 4a443ffc 编写于 作者: S sneaxiy

merge develop

test=develop
...@@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) ...@@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
...@@ -261,6 +262,12 @@ if (WITH_PROFILER) ...@@ -261,6 +262,12 @@ if (WITH_PROFILER)
add_definitions(-DWITH_GPERFTOOLS) add_definitions(-DWITH_GPERFTOOLS)
endif() endif()
if (WITH_JEMALLOC)
find_package(JeMalloc REQUIRED)
include_directories(${JEMALLOC_INCLUDE_DIR})
add_definitions(-DWITH_JEMALLOC)
endif()
include(generic) # simplify cmake module include(generic) # simplify cmake module
include(package) # set paddle packages include(package) # set paddle packages
include(ccache) # set ccache for compilation include(ccache) # set ccache for compilation
......
# - Find JeMalloc library
# Find the native JeMalloc includes and library
#
# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc.
# JEMALLOC_LIBRARIES - List of libraries when using jemalloc.
# JEMALLOC_FOUND - True if jemalloc found.
find_path(JEMALLOC_INCLUDE_DIR
NAMES jemalloc/jemalloc.h
HINTS ${JEMALLOC_ROOT_DIR}/include)
find_library(JEMALLOC_LIBRARIES
NAMES jemalloc
HINTS ${JEMALLOC_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR)
mark_as_advanced(
JEMALLOC_LIBRARIES
JEMALLOC_INCLUDE_DIR)
...@@ -134,6 +134,7 @@ if(WITH_GPU) ...@@ -134,6 +134,7 @@ if(WITH_GPU)
message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE) set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
endif() endif()
add_definitions(-DWITH_ANAKIN)
endif() endif()
if(WITH_ANAKIN) if(WITH_ANAKIN)
# NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR
......
...@@ -2,7 +2,7 @@ if(NOT WITH_GPU) ...@@ -2,7 +2,7 @@ if(NOT WITH_GPU)
return() return()
endif() endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs "30 35 50 52 60 61 70 75")
set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61") set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
...@@ -59,7 +59,7 @@ endfunction() ...@@ -59,7 +59,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable) # select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable) function(select_nvcc_arch_flags out_variable)
# List of arch names # List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
set(archs_name_default "All") set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto") list(APPEND archs_names "Auto")
...@@ -93,6 +93,8 @@ function(select_nvcc_arch_flags out_variable) ...@@ -93,6 +93,8 @@ function(select_nvcc_arch_flags out_variable)
set(cuda_arch_bin "60 61") set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70") set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "All") elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs}) set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
......
...@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) ...@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph") SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_GIT_TAG "v0.10.1") SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
......
...@@ -115,6 +115,10 @@ function(common_link TARGET_NAME) ...@@ -115,6 +115,10 @@ function(common_link TARGET_NAME)
if (WITH_PROFILER) if (WITH_PROFILER)
target_link_libraries(${TARGET_NAME} gperftools::profiler) target_link_libraries(${TARGET_NAME} gperftools::profiler)
endif() endif()
if (WITH_JEMALLOC)
target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES})
endif()
endfunction() endfunction()
......
...@@ -184,7 +184,7 @@ endif() ...@@ -184,7 +184,7 @@ endif()
target_link_libraries(executor garbage_collector) target_link_libraries(executor garbage_collector)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
graph build_strategy graph build_strategy
fast_threaded_ssa_graph_executor variable_helper) fast_threaded_ssa_graph_executor variable_helper)
......
...@@ -77,6 +77,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT ...@@ -77,6 +77,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context) simple_threadpool device_context)
cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle) device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
......
...@@ -19,6 +19,13 @@ ...@@ -19,6 +19,13 @@
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
// asynchronous nccl allreduce or synchronous issue:
// https://github.com/PaddlePaddle/Paddle/issues/15049
DEFINE_bool(
sync_nccl_allreduce, false,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
...@@ -48,17 +55,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -48,17 +55,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (NoDummyInputSize() == 1 &&
local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
#else
if (NoDummyInputSize() == 1) {
#endif
return; // No need to all reduce when GPU count = 1;
} else {
// Wait input done
WaitInputVarGenerated(); WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
...@@ -105,16 +101,32 @@ void AllReduceOpHandle::RunImpl() { ...@@ -105,16 +101,32 @@ void AllReduceOpHandle::RunImpl() {
auto comm = nccl_ctx.comm_; auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] { all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
ncclSum, comm, stream)); comm, stream));
}); });
} }
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when manage NCCL by per thread per device
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard; platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) { for (auto &call : all_reduce_calls) {
call(); call();
} }
}
}); });
if (FLAGS_sync_nccl_allreduce) {
for (auto &p : places_) {
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
cudaStreamSynchronize(stream);
}
}
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
...@@ -143,7 +155,6 @@ void AllReduceOpHandle::RunImpl() { ...@@ -143,7 +155,6 @@ void AllReduceOpHandle::RunImpl() {
}); });
} }
} }
}
} }
std::string AllReduceOpHandle::Name() const { return "all_reduce"; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h" #include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h"
...@@ -31,7 +31,11 @@ namespace framework { ...@@ -31,7 +31,11 @@ namespace framework {
namespace details { namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1); // Should fix the allreduce op order if scheduling
// them in multiple threads or processes to avoid hang.
return (!strategy.enable_sequential_execution_ &&
strategy.num_trainers_ > 1) ||
strategy.enable_parallel_graph_;
} }
class ParallelExecutorPassBuilder : public ir::PassBuilder { class ParallelExecutorPassBuilder : public ir::PassBuilder {
...@@ -82,12 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -82,12 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
if (strategy.memory_optimize_) { if (strategy.memory_optimize_) {
auto analysis_var_pass = AppendPass("analysis_var_pass"); auto analysis_var_pass = AppendPass("analysis_var_pass");
} }
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass"); AppendMultiDevPass(strategy);
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
multi_devices_pass->Set<int>("num_trainers",
new int(strategy_.num_trainers_));
// Add a graph print pass to record a graph with device info. // Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) { if (!strategy_.debug_graphviz_path_.empty()) {
...@@ -113,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -113,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
} }
} }
// Convert graph to run on multi-devices.
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass;
if (strategy_.is_distribution_) {
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else {
PADDLE_THROW("Unknown reduce strategy.");
}
}
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
}
private: private:
BuildStrategy strategy_; BuildStrategy strategy_;
}; };
...@@ -129,9 +148,14 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy( ...@@ -129,9 +148,14 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
return pass_builder_; return pass_builder_;
} }
bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply( std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places, const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes, const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else #else
...@@ -142,19 +166,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -142,19 +166,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program)); std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) { for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") { if (IsMultiDevPass(pass->Type())) {
pass->Erase("places"); pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places); pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase("loss_var_name"); pass->Erase(kLossVarName);
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name); pass->SetNotOwned<const std::string>(kLossVarName, &loss_var_name);
pass->Erase("local_scopes"); pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes", pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes); &local_scopes);
pass->Erase(kNRanks);
pass->Set<size_t>(kNRanks, new size_t(nranks));
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs"); pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx); pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif #endif
} else if (pass->Type() == "analysis_var_pass") { } else if (pass->Type() == "analysis_var_pass") {
const std::vector<OpDesc *> *all_op_descs = const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps()); new std::vector<OpDesc *>(main_program.Block(0).AllOps());
...@@ -195,7 +223,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -195,7 +223,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
USE_PASS(fuse_elewise_add_act_pass); USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass); USE_PASS(multi_batch_merge_pass);
USE_PASS(multi_devices_pass); USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(allreduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass); USE_PASS(multi_devices_print_pass);
USE_PASS(analysis_var_pass); USE_PASS(analysis_var_pass);
......
...@@ -74,8 +74,6 @@ struct BuildStrategy { ...@@ -74,8 +74,6 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false}; bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false};
bool memory_optimize_{false}; bool memory_optimize_{false};
bool memory_early_delete_{false}; bool memory_early_delete_{false};
...@@ -84,6 +82,10 @@ struct BuildStrategy { ...@@ -84,6 +82,10 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false}; bool fuse_broadcast_op_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
bool is_distribution_{false};
int num_trainers_{1}; int num_trainers_{1};
int trainer_id_{0}; int trainer_id_{0};
std::vector<std::string> trainers_endpoints_; std::vector<std::string> trainers_endpoints_;
...@@ -104,12 +106,15 @@ struct BuildStrategy { ...@@ -104,12 +106,15 @@ struct BuildStrategy {
bool IsFinalized() const { return is_finalized_; } bool IsFinalized() const { return is_finalized_; }
bool IsMultiDevPass(const std::string &pass_name) const;
// Apply the passes built by the pass_builder_. The passes will be // Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph. // applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program, std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const; platform::NCCLContextMap *nccl_ctxs) const;
...@@ -117,6 +122,13 @@ struct BuildStrategy { ...@@ -117,6 +122,13 @@ struct BuildStrategy {
const bool use_cuda) const; const bool use_cuda) const;
#endif #endif
// If set true, ParallelExecutor would build the main_program into multiple
// graphs,
// each of the graphs would run with one device. This approach can achieve
// better performance
// on some scenarios.
mutable bool enable_parallel_graph_ = false;
private: private:
mutable bool is_finalized_ = false; mutable bool is_finalized_ = false;
mutable std::shared_ptr<ir::PassBuilder> pass_builder_; mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
...@@ -21,7 +21,15 @@ namespace paddle { ...@@ -21,7 +21,15 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { class SSAGraghBuilderWithChecker : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
}
bool IsValidGraph(const ir::Graph *graph) const {
std::unordered_map<OpHandleBase *, size_t> pending_ops; std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars; std::unordered_set<VarHandleBase *> pending_vars;
std::unordered_set<VarHandleBase *> ready_vars; std::unordered_set<VarHandleBase *> ready_vars;
...@@ -82,7 +90,9 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { ...@@ -82,7 +90,9 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
ready_vars.clear(); ready_vars.clear();
} }
return true; return true;
} }
};
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -134,15 +134,8 @@ void AddOutputToLeafOps(ir::Graph *graph) { ...@@ -134,15 +134,8 @@ void AddOutputToLeafOps(ir::Graph *graph) {
} }
} // namespace } // namespace
static const char kLossVarName[] = "loss_var_name"; void MultiDevSSAGraphBuilderBase::Init() const {
static const char kPlaces[] = "places";
static const char kLocalScopes[] = "local_scopes";
static const char kStrategy[] = "strategy";
static const char kNumTrainers[] = "num_trainers";
void MultiDevSSAGraphBuilder::Init() const {
all_vars_.clear(); all_vars_.clear();
balance_vars_.clear();
loss_var_name_ = Get<const std::string>(kLossVarName); loss_var_name_ = Get<const std::string>(kLossVarName);
places_ = Get<const std::vector<platform::Place>>(kPlaces); places_ = Get<const std::vector<platform::Place>>(kPlaces);
...@@ -151,31 +144,16 @@ void MultiDevSSAGraphBuilder::Init() const { ...@@ -151,31 +144,16 @@ void MultiDevSSAGraphBuilder::Init() const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs"); nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
#endif #endif
balance_vars_.resize(places_.size(), 0);
if (strategy_.enable_data_balance_ && places_.size() == 1) {
LOG(WARNING) << "It is no need to enable data balance when there is only "
"one place. enable_data_balance is set to False.";
strategy_.enable_data_balance_ = false;
}
} }
std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
Init(); Init();
// Give the topology sort order and rebuild the graph structure. std::vector<ir::Node *> sorted_ops = SortOperations(*graph);
std::vector<ir::Node *> sorted_ops = ir::TopologySortOperations(*graph);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
sorted_ops = SortForReduceMode(sorted_ops);
}
auto nodes = graph->ReleaseNodes(); auto nodes = graph->ReleaseNodes();
ir::Graph &result = *graph; ir::Graph &result = *graph;
int num_trainers = Get<int>(kNumTrainers);
for (auto &node : nodes) { for (auto &node : nodes) {
if (node->IsVar() && node->Var()) { if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var()); all_vars_.emplace(node->Name(), node->Var());
...@@ -187,142 +165,57 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -187,142 +165,57 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphDepVars, new GraphDepVars);
result.Set(kGraphOps, new GraphOps); result.Set(kGraphOps, new GraphOps);
std::vector<std::unordered_set<std::string>> bcast_var_name_set;
bcast_var_name_set.resize(places_.size());
bool is_forwarding = true; bool is_forwarding = true;
bool is_dist_train = false; bool insert_collection_ops = NeedCollectiveOps();
std::unordered_map<std::string, int> sharded_var_device;
for (ir::Node *node : sorted_ops) { for (ir::Node *node : sorted_ops) {
if (OpHaveRole(*node, OpRole::kRPC)) { if (DealWithSpecialOp(&result, node)) {
int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); continue;
PADDLE_ENFORCE(op_dev_id != -1, } else {
"Can not schedule the RPC operator to the right place."); // This op runs on all devices
if (node->Op()->Type() == "recv") { if (IsScaleLossOp(node)) {
auto recv_vars_attr =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient]
if (recv_vars_attr[0].find(".block") == std::string::npos) {
bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
}
}
is_dist_train = true;
} else if (OpHaveRole(*node, OpRole::kDist)) {
int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device);
if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0];
bcast_var_name_set[op_dev_id].emplace(origin_param_name);
}
} else if (IsScaleLossOp(node)) {
// user can customize loss@grad if not use_default_grad_scale_ // user can customize loss@grad if not use_default_grad_scale_
if (strategy_.gradient_scale_ != InsertScaleLossGradOp(&result, node);
BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle?
auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType();
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0],
out_dtype);
}
// This assumes the backward generating code will ensure IsScaleLossOp // This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss. // is true only for the op that scale the final scalar loss.
// It also assumes backward op will always follow the forward op in // It also assumes backward op will always follow the forward op in
// the block. // the block.
is_forwarding = false; is_forwarding = false;
} else {
int op_dev_id = GetOpDeviceID(node, sharded_var_device);
if (op_dev_id != -1) { // This op only runs on one specific device.
CreateComputationalOp(&result, node, op_dev_id);
for (ir::Node *n : node->outputs) {
sharded_var_device.emplace(n->Name(), op_dev_id);
}
} else {
// This op runs on all devices, and its output may have parameter's
// gradients.
// TODO(paddle-dev): Why is so special about "read" op?
if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) {
node->Op()->SetAttr("throw_eof_exp", false);
CreateComputationalOps(&result, node, places_.size());
const auto &data_var_names = node->Op()->Output("Out");
InsertDataBalanceOp(&result, data_var_names);
} else { } else {
CreateComputationalOps(&result, node, places_.size()); CreateComputationalOps(&result, node, places_.size());
} }
if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { // Insert collection ops
if (!is_forwarding && insert_collection_ops) {
try {
bool is_bk_op = bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr( static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) & OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward)); static_cast<int>(OpRole::kBackward));
if (!is_bk_op) continue; if (!is_bk_op) continue;
// Currently, we assume that once gradient is generated, it can be // Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. // broadcast, and each gradient is only broadcast once.
try { auto backward_vars =
auto backward_vars = boost::get<std::vector<std::string>>( boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName())); OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
for (size_t i = 0; i < backward_vars.size(); i += 2) { for (size_t i = 0; i < backward_vars.size(); i += 2) {
auto &p_name = backward_vars[i]; auto &p_name = backward_vars[i];
auto &g_name = backward_vars[i + 1]; auto &g_name = backward_vars[i + 1];
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
size_t cur_device_id = -1;
switch (strategy_.reduce_) { InsertCollectiveOp(&result, p_name, g_name);
case BuildStrategy::ReduceStrategy::kReduce:
cur_device_id = GetAppropriateDeviceID({g_name});
CreateReduceOp(&result, g_name, cur_device_id);
sharded_var_device.emplace(g_name, cur_device_id);
if (!is_dist_train) {
bcast_var_name_set[cur_device_id].emplace(p_name);
}
break;
case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(g_name)) {
CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0);
} else {
InsertAllReduceOp(&result, g_name);
}
break;
default:
LOG(FATAL) << "Unknown reduce strategy ";
break;
}
} }
} catch (boost::bad_get e) { } catch (boost::bad_get e) {
} }
} }
} }
} }
}
bool use_gpu = false;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
use_gpu = nccl_ctxs_ != nullptr;
#endif
// Insert broadcast operators principle: InsertPostprocessOps(&result);
// 1. Broadcast optimized parameters in Reduce strategy;
// 2. No need broadcast optimized parameters in AllReduce strategy because of
// the optimization sub-graph would be run on every GPU;
// 3. Allways broadcast received parameters in Distribute Training.
if ((use_gpu &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
is_dist_train) {
if (strategy_.fuse_broadcast_op_) {
CreateFusedBroadcastOp(&result, bcast_var_name_set);
} else {
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) {
CreateBroadcastOp(&result, bcast_name, dev_id);
}
}
}
}
/* /*
Dependency graph has been constructed. However, there are still data Dependency graph has been constructed. However, there are still data
hazards need to be handled. hazards need to be handled.
...@@ -337,65 +230,52 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -337,65 +230,52 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
return graph; return graph;
} }
std::vector<ir::Node *> MultiDevSSAGraphBuilder::SortForReduceMode( void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
const std::vector<ir::Node *> &topo_ops) const { ir::Graph *result, const ir::Node *node) const {
std::unordered_map<std::string, int> sharded_var_device; // user can customize loss@grad if not use_default_grad_scale_
std::vector<ir::Node *> sorted_ops; size_t loss_scale = 0;
std::unordered_map<std::string, std::vector<ir::Node *>> delayed_op; switch (this->strategy_.gradient_scale_) {
sorted_ops.reserve(topo_ops.size()); case BuildStrategy::GradientScaleStrategy::kOne:
loss_scale = 1;
auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { break;
sharded_var_device.emplace(var_name, dev_id); case BuildStrategy::GradientScaleStrategy::kCoeffNumDevice:
if (delayed_op.count(var_name)) { loss_scale = Get<size_t>(kNRanks);
auto &ops = delayed_op.at(var_name); break;
sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); case BuildStrategy::GradientScaleStrategy::kCustomized:
delayed_op.at(var_name).clear(); loss_scale = 0;
break;
default:
LOG(FATAL) << "Unknown gradient scale strategy.";
break;
} }
};
for (ir::Node *node : topo_ops) { if (loss_scale) {
int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); // TODO(paddle-dev): Why is there no input for this op_handle?
if (op_dev_id > -1) { auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
// This op only runs on one specific device. auto out_dtype = this->all_vars_.at(loss_grad_name)->GetDataType();
sorted_ops.emplace_back(node); this->CreateScaleLossGradOp(result, loss_grad_name, node->outputs[0],
for (ir::Node *n : node->outputs) { loss_scale, out_dtype);
insert_delayed_op(n->Name(), op_dev_id);
}
} else if (op_dev_id == -1) {
// This op runs on all devices, and its output may have parameter's
// gradients.
sorted_ops.emplace_back(node);
bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) continue;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
std::vector<std::string> backward_vars;
try {
backward_vars =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
} catch (boost::bad_get e) {
} }
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); }
for (size_t i = 0; i < backward_vars.size(); i += 2) { std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
auto &g_name = backward_vars[i + 1]; const ir::Graph &graph) const {
size_t cur_device_id = GetAppropriateDeviceID({g_name}); return ir::TopologySortOperations(graph);
insert_delayed_op(g_name, static_cast<int>(cur_device_id)); }
}
} else if (op_dev_id == -2) {
// The Op on which the Op depends has not yet been generated.
}
}
PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); bool MultiDevSSAGraphBuilderBase::UseGPU() const {
return sorted_ops; bool use_gpu = false;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
use_gpu = nccl_ctxs_ != nullptr;
#endif
return use_gpu;
} }
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const {
return Get<size_t>(kNRanks) > 1;
}
void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
ir::Node *node, ir::Node *node,
size_t place_id) const { size_t place_id) const {
auto p = places_[place_id]; auto p = places_[place_id];
...@@ -420,28 +300,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, ...@@ -420,28 +300,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
} }
} }
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0;
for (auto var_name : var_names) {
if (all_vars_.find(var_name) == all_vars_.end()) continue;
auto var_desc = all_vars_.at(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GT(numel, 0);
numel_sum += numel;
}
auto smallest =
std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
balance_vars_[dev_id] += numel_sum;
return dev_id;
}
void MultiDevSSAGraphBuilder::SetCommunicationContext(
OpHandleBase *op_handle, const platform::Place &p) const { OpHandleBase *op_handle, const platform::Place &p) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (nccl_ctxs_ == nullptr) { if (nccl_ctxs_ == nullptr) {
...@@ -454,7 +313,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( ...@@ -454,7 +313,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
#endif #endif
} }
void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
const std::string &p_name, const std::string &p_name,
size_t src_dev_id) const { size_t src_dev_id) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...@@ -484,7 +343,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, ...@@ -484,7 +343,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
} }
} }
void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp(
ir::Graph *result, ir::Graph *result,
const std::vector<std::unordered_set<std::string>> &bcast_varnames) const { const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...@@ -522,7 +381,7 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( ...@@ -522,7 +381,7 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
} }
} }
void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
ir::Node *node, ir::Node *node,
int dev_id) const { int dev_id) const {
result->Get<GraphOps>(kGraphOps).emplace_back( result->Get<GraphOps>(kGraphOps).emplace_back(
...@@ -531,8 +390,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, ...@@ -531,8 +390,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
CreateOpHandleIOs(result, node, dev_id); CreateOpHandleIOs(result, node, dev_id);
} }
void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
const std::string &og) const { ir::Graph *result, const std::string &og) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
...@@ -560,101 +419,15 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, ...@@ -560,101 +419,15 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
} }
} }
void MultiDevSSAGraphBuilder::InsertDataBalanceOp( void MultiDevSSAGraphBuilderBase::CreateScaleLossGradOp(
ir::Graph *result, const std::vector<std::string> &datas) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_));
#else
result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
local_scopes_, places_));
#endif
auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
for (const std::string &d_name : datas) {
auto &vars = result->Get<GraphVars>(kGraphVars)[i][d_name];
PADDLE_ENFORCE(!vars.empty());
op_handle->AddInput(vars.back());
auto var = new VarHandle(
result->CreateEmptyNode(d_name, ir::Node::Type::kVariable),
vars.size(), i, d_name, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
}
}
int MultiDevSSAGraphBuilder::GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device,
std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1;
}
if (!OpHaveRole(*node, framework::OpRole::kOptimize)) {
return -1;
}
auto param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device);
if (dev_id == -1) {
(*delay_ops)[param_grad[1]].push_back(node);
return -2;
}
return dev_id;
}
int MultiDevSSAGraphBuilder::GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1;
}
if (!OpHaveRole(*node, framework::OpRole::kOptimize)) {
return -1;
}
auto param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device);
PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
node->Op()->Type(), param_grad[0], param_grad[1]);
return dev_id;
}
int MultiDevSSAGraphBuilder::GetVarDeviceID(
const std::string &varname,
const std::unordered_map<std::string, int> &sharded_var_device) const {
auto got = sharded_var_device.find(varname);
if (got == sharded_var_device.end()) {
auto pos = varname.find(framework::kNewGradSuffix);
if (pos != std::string::npos) {
got = sharded_var_device.find(varname.substr(0, pos));
}
}
return got == sharded_var_device.end() ? -1 : got->second;
}
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name, ir::Graph *result, const std::string &loss_grad_name,
ir::Node *out_var_node, proto::VarType::Type dtype) const { ir::Node *out_var_node, size_t loss_scale,
proto::VarType::Type dtype) const {
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
auto *op_handle = new ScaleLossGradOpHandle( auto *op_handle = new ScaleLossGradOpHandle(
result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype); loss_scale, local_scopes_[i], places_[i], dev_ctx, dtype);
result->Get<GraphOps>(kGraphOps).emplace_back(op_handle); result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale // FIXME: Currently ScaleLossGradOp only use device_count as scale
...@@ -668,9 +441,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ...@@ -668,9 +441,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
} }
} }
void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateComputationalOps(
ir::Node *node, ir::Graph *result, ir::Node *node, size_t num_places) const {
size_t num_places) const {
for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
auto p = places_[scope_idx]; auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx]; auto s = local_scopes_[scope_idx];
...@@ -680,7 +452,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, ...@@ -680,7 +452,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
} }
} }
VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result,
const std::string &og, const std::string &og,
int dst_dev_id) const { int dst_dev_id) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...@@ -711,51 +483,273 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, ...@@ -711,51 +483,273 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var; return var;
} }
int MultiDevSSAGraphBuilder::CreateDistTrainOp( bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const {
ir::Graph *result, ir::Node *node, return boost::get<int>(
std::unordered_map<std::string, int> *sharded_var_device) const { node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
int op_dev_id = -1; (static_cast<int>(OpRole::kBackward) |
std::vector<std::string> input_var_names; static_cast<int>(OpRole::kLoss)) &&
std::vector<std::string> output_var_names; !loss_var_name_.empty(); // If loss_var is empty. This is test mode
for (ir::Node *input : node->inputs) { }
input_var_names.push_back(input->Name());
bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
const std::string &og) const {
PADDLE_ENFORCE(all_vars_.count(og) != 0);
if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
return true;
} }
for (ir::Node *output : node->outputs) { return false;
output_var_names.push_back(output->Name()); }
void AllReduceSSAGraphBuilder::InsertCollectiveOp(
ir::Graph *result, const std::string &p_name,
const std::string &g_name) const {
if (IsSparseGradient(g_name)) {
CreateReduceOp(result, g_name, 0);
CreateBroadcastOp(result, g_name, 0);
} else {
CreateAllReduceOp(result, g_name);
} }
}
if (node->Op()->Type() == "split_byref" || int BalanceVarSSAGraphBuilder::GetVarDeviceID(
node->Op()->Type() == "split_selected_rows" || const std::string &varname) const {
node->Op()->Type() == "split_ids") { auto got = sharded_var_device_.find(varname);
// TODO(paddle-dev): getting the first var is not safe. if (got == sharded_var_device_.end()) {
op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); auto pos = varname.find(framework::kNewGradSuffix);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { if (pos != std::string::npos) {
op_dev_id = GetAppropriateDeviceID(input_var_names); got = sharded_var_device_.find(varname.substr(0, pos));
for (auto &varname : input_var_names) {
sharded_var_device->emplace(varname, op_dev_id);
} }
} }
for (auto &varname : output_var_names) { return got == sharded_var_device_.end() ? -1 : got->second;
sharded_var_device->emplace(varname, op_dev_id); }
int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1;
} }
} else if (node->Op()->Type() == "concat") { if (!OpHaveRole(*node, framework::OpRole::kOptimize)) {
op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); return -1;
for (auto &varname : output_var_names) { }
sharded_var_device->emplace(varname, op_dev_id); auto param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
int dev_id = GetVarDeviceID(param_grad[1]);
PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
node->Op()->Type(), param_grad[0], param_grad[1]);
return dev_id;
}
size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0;
for (auto var_name : var_names) {
if (all_vars_.find(var_name) == all_vars_.end()) continue;
auto var_desc = all_vars_.at(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GT(numel, 0);
numel_sum += numel;
} }
auto smallest =
std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
balance_vars_[dev_id] += numel_sum;
return dev_id;
}
void BalanceVarSSAGraphBuilder::ResetState() const {
balance_vars_.clear();
sharded_var_device_.clear();
balance_vars_.resize(places_.size(), 0);
}
void ReduceSSAGraphBuilder::Init() const {
MultiDevSSAGraphBuilderBase::Init();
ResetState();
}
void ReduceSSAGraphBuilder::ResetState() const {
BalanceVarSSAGraphBuilder::ResetState();
bcast_var_name_set_.clear();
bcast_var_name_set_.resize(places_.size());
}
void ReduceSSAGraphBuilder::InsertCollectiveOp(
ir::Graph *result, const std::string &p_name,
const std::string &g_name) const {
size_t cur_device_id = GetAppropriateDeviceID({g_name});
CreateReduceOp(result, g_name, cur_device_id);
sharded_var_device_.emplace(g_name, cur_device_id);
bcast_var_name_set_[cur_device_id].emplace(p_name);
}
bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
ir::Node *node) const {
int op_dev_id = BalanceVarSSAGraphBuilder::GetOpDeviceID(node);
if (op_dev_id != -1) {
// This op only runs on one specific device.
CreateComputationalOp(result, node, op_dev_id);
for (ir::Node *n : node->outputs) {
sharded_var_device_.emplace(n->Name(), op_dev_id);
}
return true;
}
return false;
}
void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
if (UseGPU()) {
if (strategy_.fuse_broadcast_op_) {
CreateFusedBroadcastOp(result, bcast_var_name_set_);
} else { } else {
LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
PADDLE_THROW( auto &to_bcast_set = bcast_var_name_set_[dev_id];
"the distribute training related op should be in [split_byref, " for (auto &bcast_name : to_bcast_set) {
"concat]."); CreateBroadcastOp(result, bcast_name, dev_id);
}
}
} }
}
}
PADDLE_ENFORCE(op_dev_id != -1, int ReduceSSAGraphBuilder::GetOpDeviceID(
"can not find right place for distributed op: %s", ir::Node *node,
node->Op()->Type()); std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops) const {
if (!OpHaveRole(*node, framework::OpRole::kOptimize)) {
return -1;
}
auto param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
int dev_id = GetVarDeviceID(param_grad[1]);
if (dev_id == -1) {
(*delay_ops)[param_grad[1]].push_back(node);
return -2;
}
return dev_id;
}
std::vector<ir::Node *> ReduceSSAGraphBuilder::SortOperations(
const ir::Graph &graph) const {
std::vector<ir::Node *> sorted_ops = ir::TopologySortOperations(graph);
return SortForReduceMode(sorted_ops);
}
std::vector<ir::Node *> ReduceSSAGraphBuilder::SortForReduceMode(
const std::vector<ir::Node *> &topo_ops) const {
std::vector<ir::Node *> sorted_ops;
std::unordered_map<std::string, std::vector<ir::Node *>> delayed_op;
sorted_ops.reserve(topo_ops.size());
ResetState();
auto insert_delayed_op = [&](const std::string &var_name, int dev_id) {
sharded_var_device_.emplace(var_name, dev_id);
if (delayed_op.count(var_name)) {
auto &ops = delayed_op.at(var_name);
sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end());
delayed_op.at(var_name).clear();
}
};
for (ir::Node *node : topo_ops) {
int op_dev_id = GetOpDeviceID(node, &delayed_op);
if (op_dev_id > -1) {
// This op only runs on one specific device.
sorted_ops.emplace_back(node);
for (ir::Node *n : node->outputs) {
insert_delayed_op(n->Name(), op_dev_id);
}
} else if (op_dev_id == -1) {
// This op runs on all devices, and its output may have parameter's
// gradients.
sorted_ops.emplace_back(node);
bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) continue;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
std::vector<std::string> backward_vars;
try {
backward_vars =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
} catch (boost::bad_get e) {
}
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
for (size_t i = 0; i < backward_vars.size(); i += 2) {
auto &g_name = backward_vars[i + 1];
size_t cur_device_id = GetAppropriateDeviceID({g_name});
insert_delayed_op(g_name, static_cast<int>(cur_device_id));
}
} else if (op_dev_id == -2) {
// The Op on which the Op depends has not yet been generated.
}
}
PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size());
ResetState();
return sorted_ops;
}
void DistSSAGraphBuilder::Init() const {
MultiDevSSAGraphBuilderBase::Init();
ResetState();
}
void DistSSAGraphBuilder::ResetState() const {
BalanceVarSSAGraphBuilder::ResetState();
bcast_var_name_set_.clear();
bcast_var_name_set_.resize(places_.size());
}
bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
ir::Node *node) const {
bool insert_op = false;
if (OpHaveRole(*node, OpRole::kRPC)) {
int op_dev_id = CreateRPCOp(result, node);
PADDLE_ENFORCE(op_dev_id != -1,
"Can not schedule the RPC operator to the right place.");
if (node->Op()->Type() == "recv") {
auto recv_vars_attr =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient]
if (recv_vars_attr[0].find(".block") == std::string::npos) {
bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]);
}
}
insert_op = true;
need_broadcast_var_ = true;
} else if (OpHaveRole(*node, OpRole::kDist)) {
int op_dev_id = CreateDistTrainOp(result, node);
if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0];
bcast_var_name_set_[op_dev_id].emplace(origin_param_name);
}
insert_op = true;
} else {
int op_dev_id = GetOpDeviceID(node);
if (op_dev_id != -1) { // This op only runs on one specific device.
CreateComputationalOp(result, node, op_dev_id); CreateComputationalOp(result, node, op_dev_id);
return op_dev_id; for (ir::Node *n : node->outputs) {
sharded_var_device_.emplace(n->Name(), op_dev_id);
}
insert_op = true;
}
}
return insert_op;
} }
void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
...@@ -774,13 +768,11 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { ...@@ -774,13 +768,11 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
} }
// Create RPC related op handles that connects its in ops and out ops. // Create RPC related op handles that connects its in ops and out ops.
int MultiDevSSAGraphBuilder::CreateRPCOp( int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const {
int op_dev_id = -1; int op_dev_id = -1;
if (node->Op()->Type() == "send") { if (node->Op()->Type() == "send") {
// TODO(paddle-dev): getting the first var is not safe. // TODO(paddle-dev): getting the first var is not safe.
op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); op_dev_id = GetVarDeviceID(node->inputs[0]->Name());
PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]),
"This hack no longer holds, please fix."); "This hack no longer holds, please fix.");
// the variable name which contains .block means it was splited by // the variable name which contains .block means it was splited by
...@@ -798,9 +790,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( ...@@ -798,9 +790,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
VLOG(10) << "send grad " << input_var_names[0] << " origin " VLOG(10) << "send grad " << input_var_names[0] << " origin "
<< send_param_grad[1] << " place: " << op_dev_id; << send_param_grad[1] << " place: " << op_dev_id;
for (auto &varname : input_var_names) { for (auto &varname : input_var_names) {
sharded_var_device->emplace(varname, op_dev_id); sharded_var_device_.emplace(varname, op_dev_id);
} }
sharded_var_device->emplace(send_param_grad[1], op_dev_id); sharded_var_device_.emplace(send_param_grad[1], op_dev_id);
} }
} else if (node->Op()->Type() == "recv") { } else if (node->Op()->Type() == "recv") {
std::vector<std::string> output_var_names; std::vector<std::string> output_var_names;
...@@ -810,7 +802,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( ...@@ -810,7 +802,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
auto recv_param_grad = boost::get<std::vector<std::string>>( auto recv_param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
if (recv_param_grad.size() == 2U) { if (recv_param_grad.size() == 2U) {
op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); op_dev_id = GetVarDeviceID(recv_param_grad[1]);
VLOG(10) << "recv param " << recv_param_grad[0] VLOG(10) << "recv param " << recv_param_grad[0]
<< " get grad place: " << recv_param_grad[1] << " get grad place: " << recv_param_grad[1]
<< " place: " << op_dev_id; << " place: " << op_dev_id;
...@@ -818,7 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( ...@@ -818,7 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
op_dev_id = GetAppropriateDeviceID(output_var_names); op_dev_id = GetAppropriateDeviceID(output_var_names);
} }
for (auto &varname : output_var_names) { for (auto &varname : output_var_names) {
sharded_var_device->emplace(varname, op_dev_id); sharded_var_device_.emplace(varname, op_dev_id);
} }
} else { } else {
// send_barrier, fetch_barrier will run on place 0; // send_barrier, fetch_barrier will run on place 0;
...@@ -845,7 +837,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( ...@@ -845,7 +837,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
for (ir::Node *output : node->outputs) { for (ir::Node *output : node->outputs) {
int outvar_dev_id = op_dev_id; int outvar_dev_id = op_dev_id;
if (node->Op()->Type() == "fetch_barrier") { if (node->Op()->Type() == "fetch_barrier") {
outvar_dev_id = GetVarDeviceID(output->Name(), *sharded_var_device); outvar_dev_id = GetVarDeviceID(output->Name());
PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
} }
p = places_[outvar_dev_id]; p = places_[outvar_dev_id];
...@@ -862,29 +854,124 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( ...@@ -862,29 +854,124 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
return op_dev_id; return op_dev_id;
} }
bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
PADDLE_ENFORCE(all_vars_.count(og) != 0); ir::Node *node) const {
if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { int op_dev_id = -1;
return true; std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (ir::Node *input : node->inputs) {
input_var_names.push_back(input->Name());
} }
return false; for (ir::Node *output : node->outputs) {
output_var_names.push_back(output->Name());
}
if (node->Op()->Type() == "split_byref" ||
node->Op()->Type() == "split_selected_rows" ||
node->Op()->Type() == "split_ids") {
// TODO(paddle-dev): getting the first var is not safe.
op_dev_id = GetVarDeviceID(input_var_names[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
op_dev_id = GetAppropriateDeviceID(input_var_names);
for (auto &varname : input_var_names) {
sharded_var_device_.emplace(varname, op_dev_id);
}
}
for (auto &varname : output_var_names) {
sharded_var_device_.emplace(varname, op_dev_id);
}
} else if (node->Op()->Type() == "concat") {
op_dev_id = GetVarDeviceID(input_var_names[0]);
for (auto &varname : output_var_names) {
sharded_var_device_.emplace(varname, op_dev_id);
}
} else {
LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
PADDLE_THROW(
"the distribute training related op should be in [split_byref, "
"concat].");
}
PADDLE_ENFORCE(op_dev_id != -1,
"can not find right place for distributed op: %s",
node->Op()->Type());
CreateComputationalOp(result, node, op_dev_id);
return op_dev_id;
} }
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
return boost::get<int>( const std::string &p_name,
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == const std::string &g_name) const {
(static_cast<int>(OpRole::kBackward) | size_t cur_device_id = 0;
static_cast<int>(OpRole::kLoss)) && switch (strategy_.reduce_) {
!loss_var_name_.empty(); // If loss_var is empty. This is test mode case BuildStrategy::ReduceStrategy::kReduce:
cur_device_id = GetAppropriateDeviceID({g_name});
CreateReduceOp(result, g_name, cur_device_id);
sharded_var_device_.emplace(g_name, cur_device_id);
break;
case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(g_name)) {
CreateReduceOp(result, g_name, 0);
CreateBroadcastOp(result, g_name, 0);
} else {
CreateAllReduceOp(result, g_name);
}
break;
default:
LOG(FATAL) << "Unknown reduce strategy.";
break;
}
} }
void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
if (need_broadcast_var_ ||
(UseGPU() &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
if (strategy_.fuse_broadcast_op_) {
CreateFusedBroadcastOp(result, bcast_var_name_set_);
} else {
for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set_[dev_id];
for (auto &bcast_name : to_bcast_set) {
CreateBroadcastOp(result, bcast_name, dev_id);
}
}
}
}
}
std::unordered_set<std::string> &MultiDevSSAGraphBuilder() {
static std::unordered_set<std::string> regs;
return regs;
}
static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
MultiDevSSAGraphBuilder().insert(builder_mode);
return 0;
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
REGISTER_PASS(multi_devices_pass, #define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class) \
paddle::framework::details::MultiDevSSAGraphBuilder) STATIC_ASSERT_GLOBAL_NAMESPACE( \
.RequirePassAttr(paddle::framework::details::kLossVarName) _reg_ssa_graph_builder_##pass_name, \
.RequirePassAttr(paddle::framework::details::kPlaces) "REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \
.RequirePassAttr(paddle::framework::details::kLocalScopes) int _reg_ssa_graph_builder_entry_##pass_name = \
.RequirePassAttr(paddle::framework::details::kStrategy) paddle::framework::details::MultiDevSSAGraphBuilderRegister(#pass_name); \
.RequirePassAttr(paddle::framework::details::kNumTrainers); REGISTER_PASS(pass_name, pass_class) \
.RequirePassAttr(paddle::framework::details::kLossVarName) \
.RequirePassAttr(paddle::framework::details::kPlaces) \
.RequirePassAttr(paddle::framework::details::kLocalScopes) \
.RequirePassAttr(paddle::framework::details::kStrategy) \
.RequirePassAttr(paddle::framework::details::kNRanks)
REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
paddle::framework::details::ReduceSSAGraphBuilder);
REGISTER_MULTI_DEVICES_PASS(
allreduce_mode_multi_devices_pass,
paddle::framework::details::AllReduceSSAGraphBuilder);
REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass,
paddle::framework::details::DistSSAGraphBuilder);
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -30,78 +31,70 @@ namespace framework { ...@@ -30,78 +31,70 @@ namespace framework {
class Scope; class Scope;
namespace details { namespace details {
class MultiDevSSAGraphBuilder : public ir::Pass { constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override; std::unique_ptr<ir::Graph> graph) const override;
private: virtual void Init() const;
void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
size_t device_id) const;
void Init() const;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
mutable platform::NCCLContextMap *nccl_ctxs_;
#endif
int GetVarDeviceID( virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &varname, const std::string &g_name) const = 0;
const std::unordered_map<std::string, int> &sharded_var_device) const;
bool IsScaleLossOp(ir::Node *node) const; virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
virtual void InsertPostprocessOps(ir::Graph *result) const = 0;
int CreateRPCOp( bool UseGPU() const;
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const; bool NeedCollectiveOps() const;
int CreateDistTrainOp(
ir::Graph *result, ir::Node *node, bool IsScaleLossOp(ir::Node *node) const;
std::unordered_map<std::string, int> *sharded_var_device) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node, void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const; size_t num_places) const;
void CreateScaleLossGradOp(ir::Graph *result, void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name, const std::string &loss_grad_name,
ir::Node *out_var_node, ir::Node *out_var_node, size_t loss_scale,
proto::VarType::Type dtype) const; proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const; int dst_dev_id) const;
void CreateComputationalOp(ir::Graph *result, ir::Node *node, void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const; int dev_id) const;
int GetOpDeviceID( bool IsSparseGradient(const std::string &og) const;
ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
void InsertDataBalanceOp(ir::Graph *result, void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;
const std::vector<std::string> &datas) const;
void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
size_t src_dev_id) const; size_t src_dev_id) const;
void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const;
void CreateFusedBroadcastOp( void CreateFusedBroadcastOp(
ir::Graph *result, ir::Graph *result,
const std::vector<std::unordered_set<std::string>> &bcast_varnames) const; const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;
bool IsSparseGradient(const std::string &og) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
void SetCommunicationContext(OpHandleBase *op_handle, void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const; const platform::Place &p) const;
std::vector<ir::Node *> SortForReduceMode( void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
const std::vector<ir::Node *> &) const; size_t device_id) const;
int GetOpDeviceID( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
ir::Node *node, mutable platform::NCCLContextMap *nccl_ctxs_;
const std::unordered_map<std::string, int> &shared_var_device, #endif
std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
const;
mutable std::string loss_var_name_; mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_; mutable std::vector<platform::Place> places_;
...@@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
mutable BuildStrategy strategy_; mutable BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_; mutable std::unordered_map<std::string, VarDesc *> all_vars_;
};
class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
protected:
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
return false;
}
virtual void InsertPostprocessOps(ir::Graph *result) const {}
};
class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
protected:
int GetVarDeviceID(const std::string &varname) const;
int GetOpDeviceID(ir::Node *node) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
virtual void ResetState() const;
mutable std::unordered_map<std::string, int> sharded_var_device_;
mutable std::vector<int64_t> balance_vars_; mutable std::vector<int64_t> balance_vars_;
}; };
class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
protected:
virtual void Init() const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const;
virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
virtual void ResetState() const;
int GetOpDeviceID(ir::Node *node,
std::unordered_map<std::string, std::vector<ir::Node *>>
*delay_ops) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &topo_ops) const;
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
};
class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
protected:
virtual void Init() const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual void ResetState() const;
int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
mutable bool need_broadcast_var_{false};
};
std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
namespace paddle {
namespace framework {
namespace details {
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::vector<std::unique_ptr<ir::Graph>> &&graphs)
: strategy_(std::move(strategy)),
local_scopes_(std::move(local_scopes)),
pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
places_(std::move(places)),
graphs_(std::move(graphs)) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
// set the correct size of thread pool to each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
? 1UL
: strategy_.num_threads_ / places_.size();
VLOG(1) << "set num_threads: " << strategy_.num_threads_
<< " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
}
}
FeedFetchList ParallelSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::vector<std::future<FeedFetchList>> run_futures;
std::vector<FeedFetchList> fetch_data;
FeedFetchList ret;
fetch_data.reserve(places_.size());
ret.reserve(fetch_tensors.size());
exception_holder_.Clear();
for (size_t i = 0; i < places_.size(); ++i) {
auto call = [this, i, &fetch_tensors]() -> FeedFetchList {
try {
return executors_[i]->Run(fetch_tensors);
} catch (...) {
exception_holder_.Catch(std::current_exception());
}
return FeedFetchList();
};
if (pool_) {
run_futures.emplace_back(pool_->enqueue(std::move(call)));
} else {
fetch_data.emplace_back(std::move(call()));
}
}
if (pool_) {
for (auto &f : run_futures) {
if (exception_holder_.IsCaught()) {
f.wait();
} else {
fetch_data.emplace_back(std::move(f.get()));
}
}
}
if (exception_holder_.IsCaught()) {
exception_holder_.ReThrow();
}
for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.reserve(local_scopes_.size());
for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx));
}
ret.emplace_back();
ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
}
return ret;
}
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -14,23 +14,36 @@ ...@@ -14,23 +14,36 @@
#pragma once #pragma once
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include <string> #include <string>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
class SSAGraghBuilderWithChecker : public ir::Pass { class ParallelSSAGraphExecutor : public SSAGraphExecutor {
protected: public:
std::unique_ptr<ir::Graph> ApplyImpl( ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
std::unique_ptr<ir::Graph> graph) const override { const std::vector<Scope *> &local_scopes,
PADDLE_ENFORCE(IsValidGraph(graph.get())); const std::vector<platform::Place> &places,
return graph; std::vector<std::unique_ptr<ir::Graph>> &&graphs);
} ~ParallelSSAGraphExecutor() final = default;
const ir::Graph &Graph() const override { return *graphs_[0]; }
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
private:
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::unique_ptr<::ThreadPool> pool_{nullptr};
std::vector<platform::Place> places_;
std::vector<std::unique_ptr<ir::Graph>> graphs_;
bool IsValidGraph(const ir::Graph* graph) const; std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
ExceptionHolder exception_holder_;
}; };
} // namespace details } // namespace details
......
...@@ -56,7 +56,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -56,7 +56,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
} }
} }
std::vector<framework::LoDTensor> fetch_data; std::vector<framework::LoDTensor> fetch_data;
std::exception_ptr eptr; std::exception_ptr eptr = nullptr;
try { try {
fetch_data = underlying_executor_->Run(fetch_tensors); fetch_data = underlying_executor_->Run(fetch_tensors);
} catch (...) { } catch (...) {
......
...@@ -40,12 +40,12 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, ...@@ -40,12 +40,12 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
void NaiveExecutor::Run() { void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE #ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set."; "cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation " "variables will be reused to save the allocation "
"overhead."; "overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are " "setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference"; "running Paddle Inference";
#endif // PADDLE_ON_INFERENCE #endif // PADDLE_ON_INFERENCE
......
...@@ -32,8 +32,11 @@ std::map<std::string, ...@@ -32,8 +32,11 @@ std::map<std::string,
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = { NgraphBridge::NG_NODE_MAP = {
{"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
{"mean", paddle::operators::ngraphs::BuildMeanNode},
{"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
{"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul", paddle::operators::ngraphs::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
{"scale", paddle::operators::ngraphs::BuildScaleNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>}, {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}, {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
{"top_k", paddle::operators::ngraphs::BuildTopKNode}}; {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
......
...@@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { ...@@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
} }
} }
backend_->call(ngraph_function_, t_out, t_in); backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
} // NgraphEngine::RunImpl } // NgraphEngine::RunImpl
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -377,6 +377,30 @@ class ExecutionContext { ...@@ -377,6 +377,30 @@ class ExecutionContext {
return op_.Outputs(name); return op_.Outputs(name);
} }
template <typename T, typename DevContext>
Tensor AllocateTmpTensor(const framework::DDim& dim,
const DevContext& dev_ctx) const {
auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance()
.Get<DevContext>(dev_ctx)
.Allocate(product(dim) * sizeof(T));
auto& deleter = tmp_allocation_ptr.get_deleter();
auto* allocation_ptr = tmp_allocation_ptr.release();
auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor(
framework::ToDataType(std::type_index(typeid(T))));
temp_tensor.Resize(dim);
temp_tensor.ResetHolder(std::move(shared_allocation));
return temp_tensor;
}
private: private:
const OperatorBase& op_; const OperatorBase& op_;
const Scope& scope_; const Scope& scope_;
......
...@@ -21,12 +21,9 @@ limitations under the License. */ ...@@ -21,12 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
...@@ -38,6 +35,8 @@ limitations under the License. */ ...@@ -38,6 +35,8 @@ limitations under the License. */
DEFINE_string(pe_profile_fname, "", DEFINE_string(pe_profile_fname, "",
"Profiler filename for PE, which generated by gperftools." "Profiler filename for PE, which generated by gperftools."
"Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable.");
DEFINE_bool(enable_parallel_graph, false,
"Force disable parallel graph execution mode if set false.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -106,6 +105,7 @@ class ParallelExecutorPrivate { ...@@ -106,6 +105,7 @@ class ParallelExecutorPrivate {
bool own_local_scope_; bool own_local_scope_;
bool use_cuda_; bool use_cuda_;
bool use_all_reduce_; bool use_all_reduce_;
size_t nranks_;
// global_ref_cnts_ is only initialized when ParallelExecutor constructs, and // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and
// then keeps unchanged // then keeps unchanged
...@@ -201,6 +201,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -201,6 +201,7 @@ ParallelExecutor::ParallelExecutor(
member_->build_strategy_ = build_strategy; member_->build_strategy_ = build_strategy;
member_->use_all_reduce_ = member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
member_->nranks_ = num_trainers * places.size();
if (!member_->use_all_reduce_) { if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1, PADDLE_ENFORCE(places.size() > 1,
...@@ -224,46 +225,80 @@ ParallelExecutor::ParallelExecutor( ...@@ -224,46 +225,80 @@ ParallelExecutor::ParallelExecutor(
} }
} }
// FIXME(Yancey1989): parallel graph mode get better performance
// in GPU allreduce distributed training. Need an elegant way to
// choice the execution strategy.
build_strategy.enable_parallel_graph_ =
EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
VLOG(1) << "Enable ParallelGraph Execution: "
<< build_strategy.enable_parallel_graph_;
if (member_->use_cuda_) { if (member_->use_cuda_) {
// Bcast Parameters to all GPUs // Bcast Parameters to all GPUs
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr; ncclUniqueId *nccl_id = nullptr;
// gen_nccl_id operator can broadcast the ncclUniqueId for nccl2 collective
// distributed training
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
if (nccl_id_var != nullptr) { if (nccl_id_var != nullptr) {
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>(); nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
} }
if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) {
if (nccl_id == nullptr) {
local_nccl_id_.reset(new ncclUniqueId());
platform::dynload::ncclGetUniqueId(local_nccl_id_.get());
nccl_id = local_nccl_id_.get();
}
}
member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id)); member_->places_, nccl_id, num_trainers, trainer_id));
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
} }
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToDevices(bcast_vars); BCastParamsToDevices(bcast_vars);
} }
// Startup Program has been run. All local scopes has correct parameters. // Startup Program has been run. All local scopes has correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
std::vector<std::unique_ptr<ir::Graph>> graphs;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (build_strategy.enable_parallel_graph_) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, {member_->places_[i]}, loss_var_name,
{member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
member_->nccl_ctxs_.get());
graphs.push_back(std::move(graph));
}
} else {
std::unique_ptr<ir::Graph> graph = build_strategy.Apply( std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, member_->local_scopes_, main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->use_cuda_, member_->nccl_ctxs_.get()); member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
graphs.push_back(std::move(graph));
}
#else #else
std::unique_ptr<ir::Graph> graph = std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
build_strategy.Apply(main_program, member_->places_, loss_var_name, main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->local_scopes_, member_->use_cuda_); member_->nranks_, member_->use_cuda_);
graphs.push_back(std::move(graph));
#endif #endif
auto max_memory_size = GetEagerDeletionThreshold(); auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) { if (max_memory_size >= 0) {
graph = member_->PrepareGCAndRefCnts(std::move(graph), for (size_t i = 0; i < graphs.size(); ++i) {
static_cast<size_t>(max_memory_size)); graphs[i] = member_->PrepareGCAndRefCnts(
std::move(graphs[i]), static_cast<size_t>(max_memory_size));
}
} }
// Step 3. Create vars in each scope. Passes may also create new vars. // Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars // skip control vars and empty vars
std::vector<details::VariableInfo> var_infos; std::vector<details::VariableInfo> var_infos;
for (auto &graph : graphs) {
for (auto &node : graph->Nodes()) { for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
var_infos.emplace_back(); var_infos.emplace_back();
...@@ -272,14 +307,16 @@ ParallelExecutor::ParallelExecutor( ...@@ -272,14 +307,16 @@ ParallelExecutor::ParallelExecutor(
var_infos.back().persistable_ = node->Var()->Persistable(); var_infos.back().persistable_ = node->Var()->Persistable();
} }
} }
}
// If the loss_var_name is given, the number of graph should be only one. // If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) { if (loss_var_name.size()) {
size_t graph_num = ir::GraphNum(*graph); size_t graph_num = ir::GraphNum(*graphs[0]);
if (graph_num > 1) { if (graph_num > 1) {
LOG(WARNING) LOG(WARNING)
<< "The number of graph should be only one, " << "The number of graph should be only one, "
"but the current graph has " "but the current graph has "
<< ir::GraphNum(*graph) << ir::GraphNum(*graphs[0])
<< " sub_graphs. If you want to see the nodes of the " << " sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, " "to specify the output dir. NOTES: if you not do training, "
...@@ -287,14 +324,20 @@ ParallelExecutor::ParallelExecutor( ...@@ -287,14 +324,20 @@ ParallelExecutor::ParallelExecutor(
} }
} }
if (build_strategy.enable_parallel_graph_) {
member_->executor_.reset(new details::ParallelSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graphs)));
} else {
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_, exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph))); std::move(graphs[0])));
} else { } else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_, exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph))); std::move(graphs[0])));
}
} }
member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
...@@ -423,6 +466,36 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ...@@ -423,6 +466,36 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
} }
} }
bool ParallelExecutor::EnableParallelGraphExecution(
const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const {
if (!FLAGS_enable_parallel_graph) return false;
bool enable_parallel_graph = true;
// TODO(Yancey1989): support sparse update in ParallelGraph mode.
for (auto &var_desc : main_program.Block(0).AllVars()) {
if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) {
enable_parallel_graph = false;
}
}
// TODO(Yancey1989): support pserver mode
for (auto &op_desc : main_program.Block(0).AllOps()) {
if (op_desc->Type() == "send" || op_desc->Type() == "recv") {
enable_parallel_graph = false;
break;
}
}
if (!member_->use_all_reduce_ || !member_->use_cuda_)
enable_parallel_graph = false;
if (build_strategy.enable_sequential_execution_ ||
exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
enable_parallel_graph = false;
return enable_parallel_graph;
}
ParallelExecutor::~ParallelExecutor() { ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) { for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait(); platform::DeviceContextPool::Instance().Get(p)->Wait();
......
...@@ -28,6 +28,10 @@ limitations under the License. */ ...@@ -28,6 +28,10 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -68,8 +72,14 @@ class ParallelExecutor { ...@@ -68,8 +72,14 @@ class ParallelExecutor {
private: private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const; void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
bool EnableParallelGraphExecution(const ProgramDesc &main_program,
const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const;
ParallelExecutorPrivate *member_; ParallelExecutorPrivate *member_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<ncclUniqueId> local_nccl_id_;
#endif
}; };
} // namespace framework } // namespace framework
......
...@@ -151,27 +151,5 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) { ...@@ -151,27 +151,5 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()), memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
src_ptr, size); src_ptr, size);
} }
template <typename T>
paddle::framework::Tensor GetTensor(
memory::allocation::AllocationPtr temp_allocation_ptr,
const framework::DDim& dim) {
auto& deleter = temp_allocation_ptr.get_deleter();
auto* allocation_ptr = temp_allocation_ptr.release();
auto shared_allocation =
std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor(
framework::ToDataType(std::type_index(typeid(T))));
temp_tensor.Resize(dim);
temp_tensor.ResetHolder(std::move(shared_allocation));
return temp_tensor;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -89,7 +89,6 @@ void ThreadPool::TaskLoop() { ...@@ -89,7 +89,6 @@ void ThreadPool::TaskLoop() {
task = std::move(tasks_.front()); task = std::move(tasks_.front());
tasks_.pop(); tasks_.pop();
} }
// run the task // run the task
task(); task();
} }
......
...@@ -123,8 +123,6 @@ struct Argument { ...@@ -123,8 +123,6 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
std::function<bool(const framework::ir::Node*)>);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
......
...@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
for (const std::string &pass_name : passes) { for (const std::string &pass_name : passes) {
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
// Set some pass attributes.
if (pass_name == "ir_analysis_pass") {
pass->Set("tensorrt_node_teller",
new SubgraphDetector::NodeInsideSubgraphTeller(
argument->tensorrt_node_teller()));
}
if (pass_name == "graph_viz_pass") { if (pass_name == "graph_viz_pass") {
std::string dot_file_path = std::to_string(pass_num) + "_ir_" + std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
(pre_pass.empty() ? "origin" : pre_pass) + (pre_pass.empty() ? "origin" : pre_pass) +
...@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
} }
if (pass_name == "tensorrt_subgraph_pass") { if (pass_name == "tensorrt_subgraph_pass") {
PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
pass->SetNotOwned("tensorrt_node_teller",
argument->tensorrt_node_teller_ptr());
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size", pass->Set("min_subgraph_size",
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
set(analysis_deps ${analysis_deps} if (TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "") CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif()
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( ...@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const { std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
auto teller = auto teller = [](const framework::ir::Node *node) {
Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller"); if (!node->IsOp() || !node->Op()) return false;
return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller, SubGraphFuser fuser(graph.get(), teller,
Get<int>("min_subgraph_size") /*min subgraph size*/); Get<int>("min_subgraph_size") /*min subgraph size*/);
...@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters( ...@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(
REGISTER_PASS(tensorrt_subgraph_pass, REGISTER_PASS(tensorrt_subgraph_pass,
paddle::inference::analysis::TensorRtSubgraphPass) paddle::inference::analysis::TensorRtSubgraphPass)
.RequirePassAttr("tensorrt_node_teller")
.RequirePassAttr("max_batch_size") .RequirePassAttr("max_batch_size")
.RequirePassAttr("workspace_size") .RequirePassAttr("workspace_size")
.RequirePassAttr("min_subgraph_size"); .RequirePassAttr("min_subgraph_size");
...@@ -27,9 +27,6 @@ namespace analysis { ...@@ -27,9 +27,6 @@ namespace analysis {
void IrAnalysisComposePass::RunImpl(Argument *argument) { void IrAnalysisComposePass::RunImpl(Argument *argument) {
ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
InitTensorRTAttrs(argument);
}
ApplyIrPasses(argument); ApplyIrPasses(argument);
CollectFusionStatis(argument); CollectFusionStatis(argument);
} }
...@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const { ...@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
return "ir-analysis-compose-pass"; return "ir-analysis-compose-pass";
} }
void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
LOG(INFO) << "Initing TensorRT pass";
argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"});
if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) {
return true;
} else {
return false;
}
});
}
}
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({ std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass", "ir_graph_build_pass", "ir_analysis_pass",
......
...@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass { ...@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
std::string repr() const override; std::string repr() const override;
private: private:
void InitTensorRTAttrs(Argument* argument);
void ApplyIrPasses(Argument* argument); void ApplyIrPasses(Argument* argument);
void CollectFusionStatis(Argument* argument); void CollectFusionStatis(Argument* argument);
......
...@@ -14,86 +14,101 @@ ...@@ -14,86 +14,101 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle_pass_builder.h" // NOLINT #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
PassStrategy *contrib::AnalysisConfig::pass_builder() const { PassStrategy *contrib::AnalysisConfig::pass_builder() const {
PADDLE_ENFORCE( if (!pass_builder_.get()) {
pass_builder_.get(), if (use_gpu_) {
"Should call constructor first, that will init the pass_builder_."); LOG(INFO) << "Create GPU IR passes";
return pass_builder_.get();
}
contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) {
this->use_gpu = use_gpu;
if (use_gpu) {
pass_builder_.reset(new GpuPassStrategy); pass_builder_.reset(new GpuPassStrategy);
} else { } else {
LOG(INFO) << "Create CPU IR passes";
pass_builder_.reset(new CpuPassStrategy); pass_builder_.reset(new CpuPassStrategy);
} }
} else if (pass_builder_->use_gpu() ^ use_gpu()) {
LOG(WARNING) << "The use_gpu flag is not compatible between Config and "
"PassBuilder, the flags are "
<< use_gpu() << " " << pass_builder_->use_gpu();
LOG(WARNING) << "Please make them compatible, still use the existing "
"PassBuilder.";
}
return pass_builder_.get();
}
contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
model_dir_ = model_dir;
}
contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
const std::string &params_file) {
prog_file_ = prog_file;
params_file_ = params_file;
} }
void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
const std::string &params_file_path) {
prog_file_ = prog_file_path;
params_file_ = params_file_path;
}
void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) {
#ifdef PADDLE_WITH_CUDA
use_gpu_ = true;
memory_pool_init_size_mb_ = memory_pool_init_size_mb;
device_id_ = device_id;
#else
LOG(ERROR) << "Please compile with gpu to EnableGpu";
use_gpu_ = false;
#endif
}
void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }
contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
// fields from Config #define CP_MEMBER(member__) member__ = other.member__;
model_dir = other.model_dir;
// fields from NativeConfig // Model related.
use_gpu = other.use_gpu; CP_MEMBER(model_dir_);
device = other.device; CP_MEMBER(prog_file_);
fraction_of_gpu_memory = other.fraction_of_gpu_memory; CP_MEMBER(params_file_);
prog_file = other.prog_file; CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and
param_file = other.param_file; // params_file_ fields.
specify_input_name = other.specify_input_name; // Gpu releated.
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; CP_MEMBER(use_gpu_);
// fields from this. CP_MEMBER(device_id_);
enable_ir_optim = other.enable_ir_optim; CP_MEMBER(memory_pool_init_size_mb_);
// For mkldnn // TensorRT releated.
use_mkldnn_ = other.use_mkldnn_; CP_MEMBER(use_tensorrt_);
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; CP_MEMBER(tensorrt_workspace_size_);
CP_MEMBER(tensorrt_max_batchsize_);
use_feed_fetch_ops = other.use_feed_fetch_ops; CP_MEMBER(tensorrt_min_subgraph_size_);
use_tensorrt_ = other.use_tensorrt_; // MKLDNN releated.
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; CP_MEMBER(use_mkldnn_);
tensorrt_workspace_size_ = other.tensorrt_workspace_size_; CP_MEMBER(mkldnn_enabled_op_types_);
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_; // Ir related.
CP_MEMBER(enable_ir_optim_);
if (use_gpu) { CP_MEMBER(use_feed_fetch_ops_);
CP_MEMBER(ir_debug_);
CP_MEMBER(specify_input_name_);
CP_MEMBER(cpu_math_library_num_threads_);
CP_MEMBER(serialized_info_cache_);
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy( pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(other.pass_builder()))); *static_cast<GpuPassStrategy *>(other.pass_builder())));
} else { } else {
pass_builder_.reset(new CpuPassStrategy( pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(other.pass_builder()))); *static_cast<CpuPassStrategy *>(other.pass_builder())));
} }
}
contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { #undef CP_MEMBER
// fields from Config
model_dir = other.model_dir;
// fields from NativeConfig
use_gpu = other.use_gpu;
device = other.device;
fraction_of_gpu_memory = other.fraction_of_gpu_memory;
prog_file = other.prog_file;
param_file = other.param_file;
specify_input_name = other.specify_input_name;
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
// For mkldnn
use_mkldnn_ = other.use_mkldnn_;
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
use_feed_fetch_ops = other.use_feed_fetch_ops;
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_;
pass_builder_ = std::move(other.pass_builder_);
} }
void contrib::AnalysisConfig::EnableMKLDNN() { void contrib::AnalysisConfig::EnableMKLDNN() {
...@@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, ...@@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
use_tensorrt_ = true; use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size; tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size; tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size; }
// Append after the conv+affine_channel fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); void contrib::AnalysisConfig::Update() {
auto info = SerializeInfoCache();
if (info == serialized_info_cache_) return;
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy);
} else {
pass_builder_.reset(new CpuPassStrategy);
}
if (use_tensorrt_) {
if (!use_gpu_) {
LOG(ERROR)
<< "TensorRT engine is not available when EnableGpu() not actived.";
} else {
// Append after the infer_clean pass.
pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
}
}
if (use_mkldnn_) {
if (!enable_ir_optim_) {
LOG(ERROR)
<< "EnableMKLDNN() only works when IR optimization is enabled.";
}
#ifdef PADDLE_WITH_MKLDNN
pass_builder()->EnableMKLDNN();
use_mkldnn_ = true;
#else
LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
use_mkldnn_ = false;
#endif
}
if (ir_debug_) {
pass_builder()->TurnOnDebug();
}
}
std::string contrib::AnalysisConfig::SerializeInfoCache() {
std::stringstream ss;
ss << use_gpu_;
ss << memory_pool_init_size_mb_;
ss << use_tensorrt_;
ss << tensorrt_workspace_size_;
ss << tensorrt_max_batchsize_;
ss << use_mkldnn_;
ss << enable_ir_optim_;
ss << use_feed_fetch_ops_;
ss << ir_debug_;
return ss.str();
}
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads;
}
float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#ifdef PADDLE_WITH_CUDA
// Get the GPU memory details and calculate the fraction of memory for the
// GPU memory pool.
size_t gpu_used, gpu_available;
platform::GpuMemoryUsage(&gpu_used, &gpu_available);
double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.;
float fraction_of_gpu_memory =
static_cast<double>(memory_pool_init_size_mb()) / total_gpu_memory;
return fraction_of_gpu_memory;
#else
return 0.;
#endif
} }
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size, size_t prog_buffer_size,
const char *param_buffer, const char *param_buffer,
size_t param_buffer_size) { size_t param_buffer_size) {
prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
param_file = std::string(param_buffer, param_buffer + param_buffer_size); params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true; model_from_memory_ = true;
} }
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile); DECLARE_bool(profile);
...@@ -59,7 +60,7 @@ bool AnalysisPredictor::Init( ...@@ -59,7 +60,7 @@ bool AnalysisPredictor::Init(
if (FLAGS_profile) { if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'"; LOG(INFO) << "You can turn off by set gflags '-profile false'";
auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU; : platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device); platform::EnableProfiler(tracking_device);
} }
...@@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram( ...@@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram(
// Optimize the program, and load parameters and modify them in the // Optimize the program, and load parameters and modify them in the
// scope_. // scope_.
// This will change the scope_ address. // This will change the scope_ address.
if (config_.enable_ir_optim) { if (config_.ir_optim()) {
status_ir_optim_enabled_ = true; status_ir_optim_enabled_ = true;
OptimizeInferenceProgram(); OptimizeInferenceProgram();
} else { } else {
...@@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram( ...@@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram(
return true; return true;
} }
bool AnalysisPredictor::CreateExecutor() { bool AnalysisPredictor::CreateExecutor() {
if (config_.use_gpu) { if (config_.use_gpu_) {
status_use_gpu_ = true; status_use_gpu_ = true;
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device_id_);
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
} }
...@@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() { ...@@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() {
} }
bool AnalysisPredictor::PrepareExecutor() { bool AnalysisPredictor::PrepareExecutor() {
executor_->Prepare(sub_scope_, *inference_program_, 0, executor_->Prepare(sub_scope_, *inference_program_, 0,
config_.use_feed_fetch_ops); config_.use_feed_fetch_ops_);
PADDLE_ENFORCE_NOT_NULL(sub_scope_); PADDLE_ENFORCE_NOT_NULL(sub_scope_);
...@@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
} }
input.set_lod(lod); input.set_lod(lod);
int idx = -1; int idx = -1;
if (config_.specify_input_name) { if (config_.specify_input_name_) {
auto name = inputs[i].name; auto name = inputs[i].name;
if (feed_names_.find(name) == feed_names_.end()) { if (feed_names_.find(name) == feed_names_.end()) {
LOG(ERROR) << "feed names from program do not have name: [" << name LOG(ERROR) << "feed names from program do not have name: [" << name
...@@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs, ...@@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
void AnalysisPredictor::OptimizeInferenceProgram() { void AnalysisPredictor::OptimizeInferenceProgram() {
status_program_optimized_ = true; status_program_optimized_ = true;
argument_.SetUseGPU(config_.use_gpu); argument_.SetUseGPU(config_.use_gpu());
argument_.SetGPUDeviceId(config_.device); argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetModelFromMemory(config_.model_from_memory_); argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program // Analyze inference_program
if (!config_.model_dir.empty()) { if (!config_.model_dir().empty()) {
argument_.SetModelDir(config_.model_dir); argument_.SetModelDir(config_.model_dir());
} else { } else {
PADDLE_ENFORCE( PADDLE_ENFORCE(
!config_.param_file.empty(), !config_.params_file().empty(),
"Either model_dir or (param_file, prog_file) should be set."); "Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file.empty()); PADDLE_ENFORCE(!config_.prog_file().empty());
argument_.SetModelProgramPath(config_.prog_file); argument_.SetModelProgramPath(config_.prog_file());
argument_.SetModelParamsPath(config_.param_file); argument_.SetModelParamsPath(config_.params_file());
} }
if (config_.use_gpu && config_.use_tensorrt_) { if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
argument_.SetUseTensorRT(true); argument_.SetUseTensorRT(true);
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
...@@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
auto passes = config_.pass_builder()->AllPasses(); auto passes = config_.pass_builder()->AllPasses();
if (!config_.enable_ir_optim) passes.clear(); if (!config_.ir_optim()) passes.clear();
argument_.SetIrAnalysisPasses(passes); argument_.SetIrAnalysisPasses(passes);
argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get())); argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
Analyzer().Run(&argument_); Analyzer().Run(&argument_);
...@@ -358,18 +359,26 @@ template <> ...@@ -358,18 +359,26 @@ template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu()) {
// 1. GPU memeroy // 1. GPU memeroy
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
config.fraction_of_gpu_memory, 0.f, PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"); config.gpu_device_id());
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags; std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) { float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
if (fraction_of_gpu_memory > 0.95f) {
LOG(ERROR)
<< "Allocate too much memory for the GPU memory pool, assigned "
<< config.memory_pool_init_size_mb() << " MB";
LOG(ERROR)
<< "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
}
if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy"); flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" + std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory); std::to_string(fraction_of_gpu_memory);
flags.push_back(flag); flags.push_back(flag);
VLOG(3) << "set flag: " << flag; VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags); framework::InitGflags(flags);
...@@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() { ...@@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() {
bool AnalysisPredictor::LoadProgramDesc() { bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program // Initialize the inference program
std::string filename; std::string filename;
if (!config_.model_dir.empty()) { if (!config_.model_dir().empty()) {
filename = config_.model_dir + "/__model__"; filename = config_.model_dir() + "/__model__";
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) { } else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
// All parameters are saved in a single file. // All parameters are saved in a single file.
// The file names should be consistent with that used // The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`. // in Python API `fluid.io.save_inference_model`.
filename = config_.prog_file; filename = config_.prog_file();
} else { } else {
if (config_.model_dir.empty() && config_.prog_file.empty()) { if (config_.model_dir().empty() && config_.prog_file().empty()) {
LOG(ERROR) LOG(ERROR)
<< "Either model_dir or (prog_file, param_file) should be set."; << "Either model_dir or (prog_file, param_file) should be set.";
return false; return false;
} }
LOG(ERROR) << string::Sprintf( LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s'.", config_.model_dir, "not valid model path '%s' or program path '%s'.", config_.model_dir(),
config_.param_file); config_.params_file());
return false; return false;
} }
...@@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() { ...@@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
proto.ParseFromString(pb_content); proto.ParseFromString(pb_content);
} else { } else {
proto.ParseFromString(config_.prog_file); proto.ParseFromString(config_.prog_file());
} }
inference_program_.reset(new framework::ProgramDesc(proto)); inference_program_.reset(new framework::ProgramDesc(proto));
return true; return true;
...@@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() { ...@@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() {
new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true); new_var->SetPersistable(true);
if (!config_.param_file.empty()) { if (!config_.params_file().empty()) {
params.push_back(new_var->Name()); params.push_back(new_var->Name());
} else { } else {
// append_op // append_op
framework::OpDesc *op = load_block->AppendOp(); framework::OpDesc *op = load_block->AppendOp();
op->SetType("load"); op->SetType("load");
op->SetOutput("Out", {new_var->Name()}); op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()});
op->CheckAttrs(); op->CheckAttrs();
} }
} }
} }
if (!config_.param_file.empty()) { if (!config_.params_file().empty()) {
// sort paramlist to have consistent ordering // sort paramlist to have consistent ordering
std::sort(params.begin(), params.end()); std::sort(params.begin(), params.end());
// append just the load_combine op // append just the load_combine op
framework::OpDesc *op = load_block->AppendOp(); framework::OpDesc *op = load_block->AppendOp();
op->SetType("load_combine"); op->SetType("load_combine");
op->SetOutput("Out", params); op->SetOutput("Out", params);
op->SetAttr("file_path", {config_.param_file}); op->SetAttr("file_path", {config_.params_file()});
op->CheckAttrs(); op->CheckAttrs();
} }
......
...@@ -25,9 +25,9 @@ namespace paddle { ...@@ -25,9 +25,9 @@ namespace paddle {
using contrib::AnalysisConfig; using contrib::AnalysisConfig;
TEST(AnalysisPredictor, analysis_off) { TEST(AnalysisPredictor, analysis_off) {
AnalysisConfig config(false); AnalysisConfig config;
config.model_dir = FLAGS_dirname; config.SetModel(FLAGS_dirname);
config.enable_ir_optim = false; config.SwitchIrOptim(false);
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get()); auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
...@@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) { ...@@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) {
} }
TEST(AnalysisPredictor, analysis_on) { TEST(AnalysisPredictor, analysis_on) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(true);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
AnalysisConfig config(true); config.EnableUseGpu(100, 0);
config.fraction_of_gpu_memory = 0.15;
#else #else
AnalysisConfig config; config.DisableGpu();
#endif #endif
config.model_dir = FLAGS_dirname;
config.enable_ir_optim = true;
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get()); auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
...@@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) { ...@@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) {
} }
// compare with NativePredictor // compare with NativePredictor
auto naive_predictor = CreatePaddlePredictor<NativeConfig>(config); auto naive_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
std::vector<PaddleTensor> naive_outputs; std::vector<PaddleTensor> naive_outputs;
ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs));
ASSERT_EQ(naive_outputs.size(), 1UL); ASSERT_EQ(naive_outputs.size(), 1UL);
...@@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) { ...@@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) {
TEST(AnalysisPredictor, ZeroCopy) { TEST(AnalysisPredictor, ZeroCopy) {
AnalysisConfig config; AnalysisConfig config;
config.model_dir = FLAGS_dirname; config.SetModel(FLAGS_dirname);
config.use_feed_fetch_ops = false; config.SwitchUseFeedFetchOps(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto w0 = predictor->GetInputTensor("firstw"); auto w0 = predictor->GetInputTensor("firstw");
...@@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) { ...@@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) {
TEST(AnalysisPredictor, Clone) { TEST(AnalysisPredictor, Clone) {
AnalysisConfig config; AnalysisConfig config;
config.model_dir = FLAGS_dirname; config.SetModel(FLAGS_dirname);
config.use_feed_fetch_ops = true; config.SwitchUseFeedFetchOps(true);
config.enable_ir_optim = true; config.SwitchIrOptim(true);
std::vector<std::unique_ptr<PaddlePredictor>> predictors; std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor(config)); predictors.emplace_back(CreatePaddlePredictor(config));
......
...@@ -19,8 +19,6 @@ limitations under the License. */ ...@@ -19,8 +19,6 @@ limitations under the License. */
#pragma once #pragma once
#define WITH_ANAKIN
#include <vector> #include <vector>
#include "framework/core/net/net.h" #include "framework/core/net/net.h"
......
...@@ -288,7 +288,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -288,7 +288,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "create NativePaddlePredictor"; VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memeroy // 1. GPU memeroy
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GE(
config.fraction_of_gpu_memory, 0.f, config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"); "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
......
...@@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) { ...@@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) {
#endif #endif
TEST(PassBuilder, Delete) { TEST(PassBuilder, Delete) {
contrib::AnalysisConfig config(false); contrib::AnalysisConfig config;
config.DisableGpu();
config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
const auto& passes = config.pass_builder()->AllPasses(); const auto& passes = config.pass_builder()->AllPasses();
auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass");
......
...@@ -36,12 +36,11 @@ namespace demo { ...@@ -36,12 +36,11 @@ namespace demo {
*/ */
void Main() { void Main() {
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
paddle::contrib::AnalysisConfig config(true); paddle::contrib::AnalysisConfig config;
config.param_file = FLAGS_modeldir + "/__params__"; config.EnableUseGpu(100, 0);
config.prog_file = FLAGS_modeldir + "/__model__"; config.SetModel(FLAGS_modeldir + "/__params__",
config.device = 0; FLAGS_modeldir + "/__model__");
config.EnableTensorRtEngine(); config.EnableTensorRtEngine();
config.fraction_of_gpu_memory = 0.1; // set by yourself
predictor = CreatePaddlePredictor(config); predictor = CreatePaddlePredictor(config);
VLOG(3) << "begin to process data"; VLOG(3) << "begin to process data";
......
...@@ -40,15 +40,14 @@ using contrib::AnalysisConfig; ...@@ -40,15 +40,14 @@ using contrib::AnalysisConfig;
*/ */
void Main(bool use_gpu) { void Main(bool use_gpu) {
std::unique_ptr<PaddlePredictor> predictor, analysis_predictor; std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
AnalysisConfig config(use_gpu); AnalysisConfig config;
config.param_file = FLAGS_modeldir + "/__params__"; if (use_gpu) {
config.prog_file = FLAGS_modeldir + "/__model__"; config.EnableUseGpu(100, 0);
config.device = 0;
if (FLAGS_use_gpu) {
config.fraction_of_gpu_memory = 0.1; // set by yourself
} }
config.SetModel(FLAGS_modeldir + "/__model__",
FLAGS_modeldir + "/__params__");
predictor = CreatePaddlePredictor<NativeConfig>(config); predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
analysis_predictor = CreatePaddlePredictor(config); analysis_predictor = CreatePaddlePredictor(config);
// Just a single batch of data. // Just a single batch of data.
......
...@@ -34,26 +34,67 @@ class AnalysisPredictor; ...@@ -34,26 +34,67 @@ class AnalysisPredictor;
namespace contrib { namespace contrib {
// NOTE WIP, not stable yet. // NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig { struct AnalysisConfig {
explicit AnalysisConfig(bool use_gpu = false); AnalysisConfig() = default;
explicit AnalysisConfig(const AnalysisConfig& other); explicit AnalysisConfig(const AnalysisConfig& other);
explicit AnalysisConfig(AnalysisConfig&& other); explicit AnalysisConfig(const std::string& model_dir);
explicit AnalysisConfig(const std::string& prog_file,
const std::string& params_file);
// Model path related.
void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
void SetModel(const std::string& prog_file_path,
const std::string& params_file_path);
void SetProgFile(const std::string& x) { prog_file_ = x; }
void SetParamsFile(const std::string& x) { params_file_ = x; }
const std::string& model_dir() const { return model_dir_; }
const std::string& prog_file() const { return prog_file_; }
const std::string& params_file() const { return params_file_; }
// GPU related.
void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
void DisableGpu();
bool use_gpu() const { return use_gpu_; }
int gpu_device_id() const { return device_id_; }
int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
float fraction_of_gpu_memory_for_pool() const;
// Determine whether to perform graph optimization. // Determine whether to perform graph optimization.
bool enable_ir_optim = true; void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
bool ir_optim() const { return enable_ir_optim_; }
// Get a pass builder for customize the passes in IR analysis phase. void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
PassStrategy* pass_builder() const; bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }
// NOT stable yet. void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
bool use_feed_fetch_ops{true}; bool specify_input_name() const { return specify_input_name_; }
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3); int max_batch_size = 1, int min_subgraph_size = 3);
bool use_tensorrt() const { return use_tensorrt_; } bool tensorrt_engine_enabled() const { return use_tensorrt_; }
void SwitchIrDebug(int x = true) { ir_debug_ = x; }
void EnableMKLDNN(); void EnableMKLDNN();
bool use_mkldnn() const { return use_mkldnn_; } bool mkldnn_enabled() const { return use_mkldnn_; }
// Set and get the number of cpu math library threads.
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
int cpu_math_library_num_threads() const {
return cpu_math_library_num_threads_;
}
NativeConfig ToNativeConfig() const {
NativeConfig config;
config.model_dir = model_dir_;
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
}
void SetMKLDNNOp(std::unordered_set<std::string> op_list) { void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
mkldnn_enabled_op_types_ = op_list; mkldnn_enabled_op_types_ = op_list;
} }
...@@ -65,10 +106,29 @@ struct AnalysisConfig : public NativeConfig { ...@@ -65,10 +106,29 @@ struct AnalysisConfig : public NativeConfig {
friend class ::paddle::AnalysisPredictor; friend class ::paddle::AnalysisPredictor;
// NOTE just for developer, not an official API, easily to be broken.
// Get a pass builder for customize the passes in IR analysis phase.
PassStrategy* pass_builder() const;
protected:
// Update the config.
void Update();
std::string SerializeInfoCache();
protected: protected:
// Model pathes.
std::string model_dir_;
std::string prog_file_;
std::string params_file_;
// GPU releated.
bool use_gpu_{false};
int device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
// TensorRT releated.
bool use_tensorrt_{false}; bool use_tensorrt_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// For workspace_size, refer it from here: // For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int tensorrt_workspace_size_; int tensorrt_workspace_size_;
...@@ -82,17 +142,24 @@ struct AnalysisConfig : public NativeConfig { ...@@ -82,17 +142,24 @@ struct AnalysisConfig : public NativeConfig {
// We set this variable to control the minimum number of nodes in the // We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value. // subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3}; int tensorrt_min_subgraph_size_{3};
std::unique_ptr<PassStrategy> pass_builder_;
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
bool model_from_memory_{false}; bool model_from_memory_{false};
};
// Configurations for Anakin engine. bool enable_ir_optim_{true};
struct AnakinConfig : public PaddlePredictor::Config { bool use_feed_fetch_ops_{true};
enum TargetType { NVGPU = 0, X86 }; bool ir_debug_{false};
int device;
std::string model_file; bool specify_input_name_{false};
int max_batch_size{-1};
TargetType target_type; int cpu_math_library_num_threads_{1};
// A runtime cache, shouldn't be transferred to others.
std::string serialized_info_cache_;
mutable std::unique_ptr<PassStrategy> pass_builder_;
}; };
} // namespace contrib } // namespace contrib
......
...@@ -26,9 +26,8 @@ limitations under the License. */ ...@@ -26,9 +26,8 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle_api.h" // NOLINT
#ifndef WITH_ANAKIN
#include "paddle_analysis_config.h" // NOLINT #include "paddle_analysis_config.h" // NOLINT
#else #include "paddle_api.h" // NOLINT
#ifdef WITH_ANAKIN
#include "paddle_anakin_config.h" // NOLINT #include "paddle_anakin_config.h" // NOLINT
#endif #endif
...@@ -62,7 +62,12 @@ class PassStrategy : public PaddlePassBuilder { ...@@ -62,7 +62,12 @@ class PassStrategy : public PaddlePassBuilder {
// still some CPU kernels running in CPU mode. // still some CPU kernels running in CPU mode.
virtual void EnableMKLDNN() = 0; virtual void EnableMKLDNN() = 0;
bool use_gpu() const { return use_gpu_; }
virtual ~PassStrategy() = default; virtual ~PassStrategy() = default;
protected:
bool use_gpu_{false};
}; };
/* /*
...@@ -88,6 +93,7 @@ class CpuPassStrategy : public PassStrategy { ...@@ -88,6 +93,7 @@ class CpuPassStrategy : public PassStrategy {
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", // "is_test_pass", //
}); });
use_gpu_ = false;
} }
virtual ~CpuPassStrategy() = default; virtual ~CpuPassStrategy() = default;
...@@ -126,10 +132,14 @@ class GpuPassStrategy : public PassStrategy { ...@@ -126,10 +132,14 @@ class GpuPassStrategy : public PassStrategy {
"conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
}); });
use_gpu_ = true;
} }
GpuPassStrategy(const GpuPassStrategy &other) GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {} : PassStrategy(other.AllPasses()) {
use_gpu_ = true;
}
void EnableMKLDNN() override; void EnableMKLDNN() override;
......
nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(plugin) add_subdirectory(plugin)
......
...@@ -12,47 +12,38 @@ ...@@ -12,47 +12,38 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #include "paddle/fluid/inference/tensorrt/op_teller.h"
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/macros.h"
#if CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif
namespace paddle { namespace paddle {
namespace platform { namespace inference {
namespace tensorrt {
class CublasHandleHolder {
public:
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_ENFORCE(dynload::cublasCreate(&handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream));
#if CUDA_VERSION >= 9000
if (math_type == CUBLAS_TENSOR_OP_MATH) {
PADDLE_ENFORCE(
dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
}
#endif
}
~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); } // Just tell by the op_types.
struct SimpleOpTypeSetTeller : public Teller {
SimpleOpTypeSetTeller() {}
template <typename Callback> bool operator()(const std::string& op_type,
inline void Call(Callback &&callback) const { const framework::OpDesc& desc) override {
std::lock_guard<std::mutex> guard(mtx_); return teller_set.count(op_type);
callback(handle_);
} }
private: private:
DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); std::unordered_set<std::string> teller_set{
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
cublasHandle_t handle_; "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
mutable std::mutex mtx_; "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"}};
}; };
} // namespace platform bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
for (auto& teller : tellers_) {
if ((*teller)(op_type, desc)) return true;
}
return false;
}
OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
} // namespace tensorrt
} // namespace inference
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* Single Op teller definition.
* One can override this and define a more complex tell logic, considerring more
* issues such as op_desc.
*/
struct Teller {
virtual bool operator()(const std::string& op_type,
const framework::OpDesc& desc) = 0;
virtual ~Teller() = default;
};
/*
* A real example:
*
* struct SomeTeller : public Teller {
* bool operator()(const std::string& op_type,
* const framework::OpDesc& desc) override {
* return op_type == "fc" && desc.Inputs().size() == 2;
* }
*};
*/
/*
* class OpTeller helps to tell whether a fluid
* operator can be transformed to a TensorRT layer.
*/
class OpTeller {
public:
static OpTeller& Global() {
static std::unique_ptr<OpTeller> x(new OpTeller);
return *x;
}
bool Tell(const std::string& op_type, const framework::OpDesc& desc);
private:
OpTeller();
private:
std::vector<std::unique_ptr<Teller>> tellers_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -41,7 +41,7 @@ endfunction() ...@@ -41,7 +41,7 @@ endfunction()
if(NOT APPLE AND WITH_MKLML) if(NOT APPLE AND WITH_MKLML)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL)
else() else()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that # TODO: fix this test on MACOS and OPENBLAS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
...@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 ...@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# normal DAM # normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)
# small DAM # small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL)
# chinese_ner # chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
...@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ...@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
# resnet50 # resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)
# mobilenet with depthwise_conv op # mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
...@@ -165,12 +165,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -165,12 +165,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->param_file = FLAGS_infer_model + "/param"; cfg->SwitchSpecifyInputNames();
cfg->use_gpu = false; cfg->SwitchIrOptim(true);
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -105,11 +105,10 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -105,11 +105,10 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model; cfg->SetModel(FLAGS_infer_model);
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -76,11 +76,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -76,11 +76,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model; cfg->SetModel(FLAGS_infer_model);
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { ...@@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
buffer_param.size()); buffer_param.size());
} else { } else {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->SetModel(FLAGS_infer_model + "/__model__",
cfg->param_file = FLAGS_infer_model + "/param"; FLAGS_infer_model + "/param");
} }
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -21,12 +21,10 @@ namespace inference { ...@@ -21,12 +21,10 @@ namespace inference {
namespace analysis { namespace analysis {
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/params"; cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->prog_file = FLAGS_infer_model + "/model"; cfg->DisableGpu();
cfg->use_gpu = false; cfg->SwitchIrOptim();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
} }
......
...@@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, ...@@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->param_file = FLAGS_infer_model + "/param"; cfg->DisableGpu();
cfg->use_gpu = false; cfg->SwitchSpecifyInputNames();
cfg->device = 0; cfg->SwitchIrOptim();
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
...@@ -225,10 +223,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -225,10 +223,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_rnn1, profile) { TEST(Analyzer_rnn1, profile) {
contrib::AnalysisConfig cfg(false); contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg.fraction_of_gpu_memory = 0.1; cfg.DisableGpu();
cfg.pass_builder()->TurnOnDebug(); cfg.SwitchIrDebug();
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) { ...@@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) {
TEST(Analyzer_rnn1, ZeroCopy) { TEST(Analyzer_rnn1, ZeroCopy) {
AnalysisConfig config; AnalysisConfig config;
SetConfig(&config); SetConfig(&config);
config.use_feed_fetch_ops = false; config.SwitchUseFeedFetchOps(false);
PaddlePlace place; PaddlePlace place;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
config.use_feed_fetch_ops = true; config.SwitchUseFeedFetchOps(true);
auto native_predictor = CreatePaddlePredictor<NativeConfig>(config); auto native_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. config.SwitchUseFeedFetchOps(
true); // the analysis predictor needs feed/fetch.
auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
#define NEW_TENSOR(name__) \ #define NEW_TENSOR(name__) \
...@@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) { ...@@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) {
TEST(Analyzer_rnn1, ZeroCopyMultiThread) { TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
AnalysisConfig config; AnalysisConfig config;
SetConfig(&config); SetConfig(&config);
config.use_feed_fetch_ops = false; config.SwitchUseFeedFetchOps(false);
#define NEW_TENSOR(name__) \ #define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__); auto name__##_tensor = predictor->GetInputTensor(#name__);
......
...@@ -105,12 +105,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -105,12 +105,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->param_file = FLAGS_infer_model + "/param"; cfg->DisableGpu();
cfg->use_gpu = false; cfg->SwitchSpecifyInputNames();
cfg->device = 0; cfg->SwitchIrOptim();
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -89,11 +89,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -89,11 +89,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model; cfg->SetModel(FLAGS_infer_model);
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -122,12 +122,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) { ...@@ -122,12 +122,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/params"; cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->prog_file = FLAGS_infer_model + "/model"; cfg->DisableGpu();
cfg->use_gpu = false; cfg->SwitchSpecifyInputNames();
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->pass_builder()->TurnOnDebug(); cfg->pass_builder()->TurnOnDebug();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
} }
......
...@@ -47,11 +47,10 @@ struct DataReader { ...@@ -47,11 +47,10 @@ struct DataReader {
}; };
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model; cfg->SetModel(FLAGS_infer_model);
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) { ...@@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) {
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/__params__"; cfg->SetModel(FLAGS_infer_model + "/__model__",
cfg->prog_file = FLAGS_infer_model + "/__model__"; FLAGS_infer_model + "/__params__");
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchIrDebug();
cfg->enable_ir_optim = true; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true;
// TODO(TJ): fix fusion gru // TODO(TJ): fix fusion gru
cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
} }
......
...@@ -64,19 +64,23 @@ std::ostream &operator<<(std::ostream &os, ...@@ -64,19 +64,23 @@ std::ostream &operator<<(std::ostream &os,
num_spaces++; num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config); os << *reinterpret_cast<const NativeConfig *>(&config);
if (!config.model_from_memory()) { if (!config.model_from_memory()) {
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
<< "\n";
} else { } else {
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "prog_file and param_file: load from memory \n"; << "prog_file and param_file: load from memory \n";
} }
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "\n";
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "\n"; << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() os << GenSpaces(num_spaces)
<< "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n";
os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled()
<< "\n"; << "\n";
os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
num_spaces--; num_spaces--;
os << GenSpaces(num_spaces) << "}\n"; os << GenSpaces(num_spaces) << "}\n";
return os; return os;
......
...@@ -328,7 +328,10 @@ void CompareNativeAndAnalysis( ...@@ -328,7 +328,10 @@ void CompareNativeAndAnalysis(
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(config, true); PrintConfig(config, true);
std::vector<PaddleTensor> native_outputs, analysis_outputs; std::vector<PaddleTensor> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false); const auto *analysis_config =
reinterpret_cast<const contrib::AnalysisConfig *>(config);
auto native_config = analysis_config->ToNativeConfig();
TestOneThreadPrediction(&native_config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true); TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
CompareResult(analysis_outputs, native_outputs); CompareResult(analysis_outputs, native_outputs);
} }
......
...@@ -46,22 +46,20 @@ void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config, ...@@ -46,22 +46,20 @@ void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
std::string model_dir, bool use_gpu, std::string model_dir, bool use_gpu,
bool use_tensorrt, int batch_size) { bool use_tensorrt, int batch_size) {
if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
config->prog_file = model_dir + "/" + FLAGS_prog_filename; config->SetModel(model_dir + "/" + FLAGS_prog_filename,
config->param_file = model_dir + "/" + FLAGS_param_filename; model_dir + "/" + FLAGS_param_filename);
} else { } else {
config->model_dir = model_dir; config->SetModel(model_dir);
} }
if (use_gpu) { if (use_gpu) {
config->use_gpu = true; config->EnableUseGpu(100, 0);
config->device = 0;
config->fraction_of_gpu_memory = 0.15;
if (use_tensorrt) { if (use_tensorrt) {
config->EnableTensorRtEngine(1 << 10, batch_size); config->EnableTensorRtEngine(1 << 10, batch_size);
config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("conv_bn_fuse_pass");
config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass");
config->pass_builder()->TurnOnDebug(); config->pass_builder()->TurnOnDebug();
} else { } else {
config->enable_ir_optim = true; config->SwitchIrOptim();
} }
} }
} }
...@@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { ...@@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
if (use_analysis || use_tensorrt) { if (use_analysis || use_tensorrt) {
contrib::AnalysisConfig config(true); contrib::AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.pass_builder()->TurnOnDebug(); config.pass_builder()->TurnOnDebug();
SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt, SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
FLAGS_batch_size); FLAGS_batch_size);
...@@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) { ...@@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) {
&native_outputs, false); &native_outputs, false);
std::vector<PaddleTensor> analysis_outputs; std::vector<PaddleTensor> analysis_outputs;
contrib::AnalysisConfig analysis_config(true); contrib::AnalysisConfig analysis_config;
analysis_config.EnableUseGpu(50, 0);
SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true, SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
use_tensorrt, FLAGS_batch_size); use_tensorrt, FLAGS_batch_size);
TestOneThreadPrediction( TestOneThreadPrediction(
...@@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) { ...@@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) {
TEST(AnalysisPredictor, use_gpu) { TEST(AnalysisPredictor, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
AnalysisConfig config(true); AnalysisConfig config;
config.model_dir = model_dir; config.EnableUseGpu(100, 0);
config.fraction_of_gpu_memory = 0.15; config.SetModel(model_dir);
config.pass_builder()->TurnOnDebug(); config.pass_builder()->TurnOnDebug();
std::vector<std::vector<PaddleTensor>> inputs_all; std::vector<std::vector<PaddleTensor>> inputs_all;
......
...@@ -53,7 +53,7 @@ if (WITH_GPU) ...@@ -53,7 +53,7 @@ if (WITH_GPU)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif() endif()
# conv_fusion_op needs cudnn 7 above # conv_fusion_op needs cudnn 7 above
if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7) if (NOT ${CUDNN_VERSION} VERSION_LESS 7100)
op_library(conv_fusion_op) op_library(conv_fusion_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
endif() endif()
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <unordered_map>
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
...@@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, ...@@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
} }
} }
template <typename T> template <typename T, typename K>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public: public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override { void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace."); "It must use CPUPlace.");
bool is_INT8 =
std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
if (!is_INT8) {
ComputeFP32(ctx);
} else {
ComputeINT8(ctx);
}
}
void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const {
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
auto& dev_ctx = auto& dev_ctx =
...@@ -274,6 +284,271 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -274,6 +284,271 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
output->set_layout(DataLayout::kMKLDNN); output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p)); output->set_format(GetMKLDNNFormat(*dst_memory_p));
} }
void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
const bool is_test = ctx.Attr<bool>("is_test");
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
auto* output = ctx.Output<Tensor>("Output");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
input->format() != memory::format::format_undef,
"Wrong layout/format set for Input tensor");
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
"Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
if (bias) {
PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
bias->format() != memory::format::format_undef,
"Wrong layout/format set for Bias tensor");
PADDLE_ENFORCE(bias->dims().size() == 1,
"Bias must only have 1 dimension, i.e. X");
}
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool fuse_relu = ctx.Attr<bool>("fuse_relu");
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
bool is_conv3d = strides.size() == 3U;
// TODO(tpatejko): add support for dilation
PADDLE_ENFORCE(
is_conv3d
? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
dilations[2] == 1
: dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently");
const T* input_data = input->data<T>();
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
GetWeightsTz(weights_tz, g, is_conv3d);
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
mkldnn::memory::data_type src_dt =
paddle::framework::ToMKLDNNDataType(input->type());
auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<uint8_t>::DataType)
: paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<int8_t>::DataType);
if (force_fp32_output) {
dst_dt = paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<float>::DataType);
}
// Get unique name for storing MKLDNN primitives
std::string key;
key.reserve(MaxKeyLength);
platform::ConvMKLDNNHandler::AppendKey(
&key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
input->format(), dst_dt, ctx.op().Output("Output"));
const std::string key_conv_pd = key + "@conv_pd";
std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr;
std::shared_ptr<mkldnn::memory> src_memory_p = nullptr;
std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr;
std::shared_ptr<mkldnn::memory> dst_memory_p = nullptr;
std::vector<primitive> pipeline;
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
nullptr;
std::shared_ptr<platform::ConvMKLDNNHandler> handler = nullptr;
auto prim_key = key + "@conv_p";
auto dst_key = key + "@dst_mem_p";
auto src_key = key + "@src_mem_p";
auto user_src_key = key + "@user_src_mem_p";
auto src_reorder_key = key + "@src_mem_preorder_p";
conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
dev_ctx.GetBlob(prim_key));
if (conv_p == nullptr || !is_test) {
const K* filter_data = filter->data<K>();
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
auto scale_out_data =
force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out");
bool is_multi_channel = scale_weights_data.size() > 1;
int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0]
: (weights_tz)[0])
: 1;
std::vector<float> output_shift_scale(count);
#pragma omp parallel for if (count > 1)
for (int i = 0; i < count; i++) {
if (scale_weights_data[i] == 0.0)
output_shift_scale[i] =
scale_out_data; // weights data will contain 0
// in some models, then weights
// scale couldn't be calculated
else
output_shift_scale[i] =
scale_out_data / (scale_in_data * scale_weights_data[i]);
}
auto user_src_md =
platform::MKLDNNMemDesc({src_tz}, src_dt, input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<K>(),
((g) == 1) ? mkldnn::memory::format::oihw
: mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
std::vector<int> bias_tz;
auto src_md =
platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, memory::data_type::s8, chosen_memory_format);
auto dst_md =
platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32,
memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine,
fuse_relu, output_shift_scale, is_test);
} else {
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
paddings, mkldnn_engine, fuse_relu,
output_shift_scale, is_test);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
mkldnn_engine, key));
// create mkldnn memory from input tensors (data/weights)
user_src_memory_p =
handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler->AcquireWeightsMemory(
user_weights_md, to_void_cast<K>(filter_data));
// create reorder primitive if the input format is not the preferred one
src_memory_p =
handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
std::shared_ptr<mkldnn::memory> weights_memory_p;
int mask_reorder =
is_multi_channel ? ((g != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0;
weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline, is_test, true, scale_weights_data,
mask_reorder);
if (!force_fp32_output) {
if (fuse_relu) {
dst_memory_p = platform::SetDstMemory<uint8_t>(ctx, output, handler);
} else {
dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
}
} else {
dst_memory_p = platform::SetDstMemory<float>(ctx, output, handler);
}
// create convolution op primitive
auto scale_bias_key = key + "@scale_bias";
if (bias) {
const float* bias_data = bias->data<float>();
auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
auto user_bias_memory_p = handler->AcquireBiasMemory(
user_bias_md, to_void_cast<float>(bias_data));
std::shared_ptr<mkldnn::memory> bias_memory_p;
int mask_reorder = is_multi_channel ? 1 << 0 : 1;
int count =
is_multi_channel
? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0])
: 1;
std::vector<float> scale_bias_data(count);
#pragma omp parallel for if (count > 1)
for (int i = 0; i < count; i++) {
scale_bias_data[i] = scale_in_data * scale_weights_data[i];
}
bias_memory_p = handler->AcquireBiasMemoryFromPrimitive(
user_bias_memory_p, pipeline, is_test, true, scale_bias_data,
mask_reorder);
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
bias_memory_p, dst_memory_p);
} else {
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
dst_memory_p);
}
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_p);
} else {
auto src_memory_reorder_p = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(src_reorder_key));
src_memory_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_key));
if (src_memory_reorder_p) {
user_src_memory_p = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(user_src_key));
user_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
} else if (src_memory_p) {
src_memory_p->set_data_handle(to_void_cast<T>(input_data));
}
dst_memory_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(dst_key));
conv_pd =
std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
dev_ctx.GetBlob(key_conv_pd));
if (conv_pd) {
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
mkldnn_engine, key));
}
if (!force_fp32_output) {
if (fuse_relu) {
dst_memory_p =
platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler);
} else {
dst_memory_p =
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler);
}
} else {
dst_memory_p =
platform::SetDstMemoryHandler<float>(ctx, output, handler);
}
if (src_memory_reorder_p) {
pipeline.push_back(*src_memory_reorder_p);
}
pipeline.push_back(*conv_p);
}
// push primitive to stream and wait until it's executed
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
}
private: private:
mkldnn::primitive_attr CreatePostOps(bool fuse_relu, mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
...@@ -301,6 +576,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -301,6 +576,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
return conv_attr; return conv_attr;
} }
mkldnn::primitive_attr CreatePostOps(
bool fuse_relu, const std::vector<float> output_shift_scale) const {
mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations;
int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
conv_attr.set_output_scales(mask, output_shift_scale);
if (fuse_relu) {
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 1.0f; // beta
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
}
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc> std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
...@@ -325,6 +617,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -325,6 +617,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
p_conv_pd); p_conv_pd);
} }
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu,
const std::vector<float> output_shift_scale,
bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
auto propagation = is_test ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto conv_desc = mkldnn::convolution_forward::desc(
propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims,
padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, output_shift_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc> std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& bias, const memory::desc& dst, const memory::desc& bias, const memory::desc& dst,
...@@ -349,6 +668,34 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -349,6 +668,34 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>( return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd); p_conv_pd);
} }
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& bias, const memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu,
const std::vector<float> output_shift_scale,
bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
auto propagation = is_test ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto conv_desc = mkldnn::convolution_forward::desc(
propagation, mkldnn::convolution_direct, src, weights, bias, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, output_shift_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
}; };
template <typename T> template <typename T>
...@@ -555,7 +902,17 @@ namespace ops = paddle::operators; ...@@ -555,7 +902,17 @@ namespace ops = paddle::operators;
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, FP32, ::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32, ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<float>); ops::ConvMKLDNNOpKernel<float, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, U8,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<uint8_t, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, S8,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<int8_t, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32, ::paddle::platform::CPUPlace, FP32,
...@@ -565,7 +922,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ...@@ -565,7 +922,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
::paddle::platform::CPUPlace, FP32, ::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32, ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<float>); ops::ConvMKLDNNOpKernel<float, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32, ::paddle::platform::CPUPlace, FP32,
......
...@@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
#endif #endif
auto input_data_type = ctx.Input<Tensor>("Input")->type(); auto input_data_type = ctx.Input<Tensor>("Input")->type();
if (input_data_type != framework::proto::VarType::INT8 &&
input_data_type != framework::proto::VarType::UINT8) {
auto filter_data_type = ctx.Input<Tensor>("Filter")->type(); auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
"input and filter data type should be consistent"); "input and filter data type should be consistent");
}
if (input_data_type == framework::proto::VarType::FP16) { if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used"); "float16 can only be used when CUDNN is used");
...@@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() { ...@@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() {
"whenever convolution output is as an input to residual " "whenever convolution output is as an input to residual "
"connection.") "connection.")
.SetDefault(false); .SetDefault(false);
AddAttr<float>("Scale_in",
"Scale_in to be used for int8 input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_out",
"Scale_out to be used for int8 output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_in_eltwise",
"Scale_in_eltwise to be used for int8 eltwise input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
...@@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() { ...@@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() {
"Defaults to \"NHWC\". Specify the data format of the output data, " "Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("force_fp32_output",
"(bool, default false) Only used in mkldnn INT8 kernel")
.SetDefault(false);
// TODO(dzhwinter): need to registered layout transform function // TODO(dzhwinter): need to registered layout transform function
AddAttr<int>("workspace_size_MB", AddAttr<int>("workspace_size_MB",
"Only used in cudnn kernel. workspace size for cudnn, in MB, " "Only used in cudnn kernel. workspace size for cudnn, in MB, "
......
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
...@@ -30,6 +29,7 @@ namespace operators { ...@@ -30,6 +29,7 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
constexpr int kConvMKLDNNFP32 = 1; constexpr int kConvMKLDNNFP32 = 1;
constexpr int kConvMKLDNNINT8 = 2; constexpr int kConvMKLDNNINT8 = 2;
constexpr int MaxKeyLength = 256;
// Base convolution operator definations for other conv // Base convolution operator definations for other conv
// like operators to reuse the implementation. // like operators to reuse the implementation.
...@@ -158,10 +158,7 @@ class GemmConvKernel : public framework::OpKernel<T> { ...@@ -158,10 +158,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
// to call the matrix multiplication interface. // to call the matrix multiplication interface.
Tensor col_matrix; Tensor col_matrix;
if (is_expand) { if (is_expand) {
auto tmp_allocation_ptr = col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T));
col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
...@@ -293,10 +290,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -293,10 +290,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
// to call the matrix multiplication interface. // to call the matrix multiplication interface.
Tensor col_matrix; Tensor col_matrix;
if (is_expand) { if (is_expand) {
auto tmp_allocation_ptr = col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T));
col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
......
...@@ -148,7 +148,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> { ...@@ -148,7 +148,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
// blockx is multiple of 32. // blockx is multiple of 32.
int blockx = std::min( int blockx = std::min(
static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5), static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5),
512L); static_cast<int64_t>(512L));
int gridx = (feature_width * num_priors + blockx - 1) / blockx; int gridx = (feature_width * num_priors + blockx - 1) / blockx;
dim3 threads(blockx, 1); dim3 threads(blockx, 1);
dim3 grids(gridx, feature_height); dim3 grids(gridx, feature_height);
......
...@@ -32,7 +32,7 @@ namespace paddle { ...@@ -32,7 +32,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows; using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim; using DDim = framework::DDim;
...@@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection( ...@@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection(
auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>(); auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
auto* out_tensor = auto* out_tensor =
scope->FindVar(out_name)->GetMutable<framework::LoDTensor>(); scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE_GT(
out_tensor->numel(), 0,
"When calling this method, the LoDTensor's numel must larger than zero. "
"Please check LoDTensor::Resize has been called first.");
auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place()); auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());
bool is_on_cpu_place = true; bool is_on_cpu_place = true;
...@@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection( ...@@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection(
auto row_numel = dims[1]; auto row_numel = dims[1];
for (size_t i = 0; i < dims[0]; ++i) { for (int64_t i = 0; i < dims[0]; ++i) {
auto id = ids_in_this_section[i]; auto id = ids_in_this_section[i];
auto origin_id = id + abs_sections[section_idx]; auto origin_id = id + abs_sections[section_idx];
auto& offsets = id_to_offset[origin_id]; auto& offsets = id_to_offset[origin_id];
...@@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name,
const std::vector<std::string>& table_names, const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap, const std::vector<std::string>& epmap,
const std::vector<int>& height_sections, const std::vector<int>& height_sections,
const framework::ExecutionContext& context) { const framework::ExecutionContext& context,
auto& local_scope = context.scope().NewScope(); const framework::Scope& scope) {
auto& local_scope = scope.NewScope();
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& cpu_ctx = *pool.Get(platform::CPUPlace()); auto& cpu_ctx = *pool.Get(platform::CPUPlace());
...@@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name,
out_var_names.push_back(out_name + "@" + epmap[i]); out_var_names.push_back(out_name + "@" + epmap[i]);
} }
auto& id_tensor = local_scope.FindVar(id_name)->Get<framework::LoDTensor>(); auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
std::vector<int64_t> ids_vector; std::vector<int64_t> ids_vector;
if (platform::is_cpu_place(id_tensor.place())) { if (platform::is_cpu_place(id_tensor.place())) {
auto* id_data = id_tensor.data<int64_t>(); auto* id_data = id_tensor.data<int64_t>();
for (size_t i = 0; i < id_tensor.numel(); ++i) { for (int64_t i = 0; i < id_tensor.numel(); ++i) {
ids_vector.push_back(id_data[i]); ids_vector.push_back(id_data[i]);
} }
} else { } else {
...@@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
PADDLE_THROW("paddle is not compiled with CUDA!"); PADDLE_THROW("paddle is not compiled with CUDA!");
#else #else
auto cpu_place = platform::CPUPlace(); auto cpu_place = platform::CPUPlace();
framework::Tensor cpu_tensor; framework::LoDTensor cpu_tensor;
auto* cpu_tensor_data = auto* cpu_tensor_data =
cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place); cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
auto stream = auto stream =
...@@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
out_var_names, height_sections, splited_ids, out_var_names, height_sections, splited_ids,
context, &local_scope, &actual_ctx); context, &local_scope, &actual_ctx);
scope.DeleteScope(&local_scope);
context.scope().DeleteScope(&local_scope);
} }
}; // namespace distributed }; // namespace distributed
......
...@@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name,
const std::vector<std::string>& table_names, const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap, const std::vector<std::string>& epmap,
const std::vector<int>& height_sections, const std::vector<int>& height_sections,
const framework::ExecutionContext& context); const framework::ExecutionContext& context,
const framework::Scope& scope);
template <typename T>
void prefetch_with_reconstruct(const std::string& id_name,
const std::string& out_name,
const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap,
const std::vector<int>& height_sections,
const framework::ExecutionContext& context,
const framework::Scope& scope,
framework::LoDTensor* original) {
prefetch(id_name, out_name, table_names, epmap, height_sections, context,
scope);
auto& out = scope.FindVar(out_name)->Get<framework::LoDTensor>();
auto& ids = scope.FindVar(id_name)->Get<framework::LoDTensor>();
auto* original_value = original->data<T>();
auto* out_value = out.data<T>();
size_t original_width = original->numel() / original->dims()[0];
bool is_on_cpu_place = true;
if (!platform::is_cpu_place(ids.place())) {
is_on_cpu_place = false;
}
if (is_on_cpu_place) {
for (int64_t i = 0; i < ids.numel(); i++) {
const T* out_rows = out_value + original_width * i;
T* original_row =
original_value + original_width * ids.data<int64_t>()[i];
std::memcpy(original_row, out_rows, original_width * sizeof(T));
}
} else {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("paddle is not compiled with CUDA!");
#else
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& actual_ctx = *pool.Get(context.GetPlace());
for (int64_t i = 0; i < ids.numel(); i++) {
const T* out_rows = out_value + original_width * i;
T* original_row =
original_value + original_width * ids.data<int64_t>()[i];
auto stream =
static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
memory::Copy(boost::get<platform::CUDAPlace>(ids.place()), original_row,
platform::CPUPlace(), out_rows, original_width * sizeof(T),
stream);
}
#endif
}
}
}; // namespace distributed }; // namespace distributed
}; // namespace operators }; // namespace operators
......
...@@ -2,7 +2,9 @@ include(operators) ...@@ -2,7 +2,9 @@ include(operators)
register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op) register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op)
if (WITH_GPU) if (WITH_GPU)
op_library(fusion_transpose_flatten_concat_op) op_library(fusion_transpose_flatten_concat_op)
op_library(fusion_conv_inception_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
if (NOT ${CUDNN_VERSION} VERSION_LESS 7100)
op_library(fusion_conv_inception_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
endif()
endif() endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle {
namespace operators {
class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("W"),
"Input W of FusedEmbeddingSeqPoolOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Ids"),
"Input Ids of FusedEmbeddingSeqPoolOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output of FusedEmbeddingSeqPoolOp should not be null.");
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
const std::string& combiner = ctx->Attrs().Get<std::string>("combiner");
PADDLE_ENFORCE_EQ(table_dims.size(), 2);
PADDLE_ENFORCE_GE(ids_dims.size(), 1,
"The dim size of the 'Ids' tensor must greater than 1.");
PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1,
"The last dimension of the 'Ids' tensor must be 1.");
// we only support sum now
PADDLE_ENFORCE_EQ(combiner, "sum");
int64_t last_dim = table_dims[1];
for (int i = 1; i != ids_dims.size(); ++i) {
last_dim *= ids_dims[i];
}
if (ctx->IsRuntime()) {
framework::Variable* ids_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Ids")[0]);
const auto& ids_lod = ids_var->Get<LoDTensor>().lod();
// in run time, the LoD of ids must be 1
PADDLE_ENFORCE(ids_lod.size(), 1u,
"The LoD level of Input(Ids) must be 1");
PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
int64_t batch_size = ids_lod[0].size() - 1;
// in run time, the shape from Ids -> output
// should be [seq_length, 1] -> [batch_size, embedding_size]
ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim}));
} else {
// in compile time, the lod level of ids must be 1
framework::VarDesc* ids_desc =
boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);
// in compile time, the shape from Ids -> output
// should be [-1, 1] -> [-1, embedding_size]
ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim}));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("W",
"(Tensor) The input represents embedding tensors, "
"which is a learnable parameter.");
AddInput("Ids",
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
"The last dimension size must be 1.");
AddOutput("Out", "The lookup results, which have the same type as W.");
AddAttr<std::string>("combiner",
"(string, default sum) "
"A string specifying the reduction op. Currently sum "
"are supported, sum computes the weighted sum of the "
"embedding results for each row.")
.SetDefault("sum");
// NOTE(minqiyang): grad_inplace is an temporal attribute,
// please do NOT set this attribute in python layer.
AddAttr<bool>("grad_inplace",
"(boolean, default false) "
"If the grad op reuse the input's variable.")
.SetDefault(false);
AddAttr<bool>("is_sparse",
"(boolean, default false) "
"Sparse update.")
.SetDefault(false);
AddComment(R"DOC(
FusedEmbeddingSeqPool Operator.
Computes embeddings for the given ids and weights.
This operator is used to perform lookups on the parameter W,
then computes the weighted sum of the lookups results for each row
and concatenated into a dense tensor.
The input Ids should carry the LoD (Level of Details) information.
And the output will change the LoD information with input Ids.
)DOC");
}
};
class FusedEmbeddingSeqPoolOpGradDescMaker
: public framework::DefaultGradOpDescMaker<true> {
using ::paddle::framework::DefaultGradOpDescMaker<
true>::DefaultGradOpDescMaker;
protected:
virtual std::string GradOpType() const {
return "fused_embedding_seq_pool_grad";
}
};
class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto table_dims = ctx->GetInputDim("W");
ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class FusedEmbeddingSeqPoolOpGradVarTypeInference
: public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr);
if (is_sparse) {
VLOG(3) << "fused_embedding_seq_pool_grad op "
<< framework::GradVarName("W") << " is set to SelectedRows";
block->Var(out_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
} else {
VLOG(3) << "fused_embedding_seq_pool_grad op "
<< framework::GradVarName("W") << " is set to LoDTensor";
block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
}
block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp,
ops::FusedEmbeddingSeqPoolOpGradDescMaker,
ops::FusedEmbeddingSeqPoolOpMaker);
REGISTER_OPERATOR(fused_embedding_seq_pool_grad,
ops::FusedEmbeddingSeqPoolOpGrad,
ops::FusedEmbeddingSeqPoolOpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool,
ops::FusedEmbeddingSeqPoolKernel<float>,
ops::FusedEmbeddingSeqPoolKernel<double>);
REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad,
ops::FusedEmbeddingSeqPoolGradKernel<float>,
ops::FusedEmbeddingSeqPoolGradKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;
template <typename T>
struct EmbeddingVSumFunctor {
void operator()(const framework::ExecutionContext &context,
const LoDTensor *table_t, const LoDTensor *ids_t,
LoDTensor *output_t) {
auto *table = table_t->data<T>();
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
int64_t last_dim = output_t->dims()[1];
const int64_t *ids = ids_t->data<int64_t>();
auto ids_lod = ids_t->lod()[0];
int64_t ids_count = ids_t->numel() / ids_lod.back();
auto *output = output_t->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
size_t begin = ids_lod[i] * ids_count;
for (int64_t j = 0; j != ids_count; ++j) {
PADDLE_ENFORCE_LT(ids[begin], row_number);
PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
blas.VCOPY(row_width, table + ids[begin + j] * row_width,
output + i * last_dim + j * row_width);
}
for (int64_t r = (ids_lod[i] + 1) * ids_count;
r < ids_lod[i + 1] * ids_count; ++r) {
PADDLE_ENFORCE_LT(ids[r], row_number);
PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
blas.AXPY(row_width, 1., table + ids[r] * row_width,
output + i * last_dim + (r % ids_count) * row_width);
}
}
}
};
template <typename T>
class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const LoDTensor *ids_t = context.Input<LoDTensor>("Ids"); // int tensor
LoDTensor *output_t = context.Output<LoDTensor>("Out"); // float tensor
const LoDTensor *table_var = context.Input<LoDTensor>("W");
const std::string &combiner_type = context.Attr<std::string>("combiner");
if (combiner_type == "sum") {
EmbeddingVSumFunctor<T> functor;
functor(context, table_var, ids_t, output_t);
}
}
};
template <typename T>
class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *table_var = context.InputVar("W");
DDim table_dim;
if (table_var->IsType<LoDTensor>()) {
table_dim = context.Input<LoDTensor>("W")->dims();
} else if (table_var->IsType<SelectedRows>()) {
auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims();
} else {
PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
}
bool is_sparse = context.Attr<bool>("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) {
auto *ids = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel();
auto lod = ids->lod()[0];
int64_t row_width = d_output->dims()[1];
framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
new_rows->resize(ids_num);
std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]});
T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
const T *d_output_data = d_output->data<T>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
int64_t in_offset = lod[i] * row_width;
const T *out_pos = d_output_data + i * row_width;
T *in_pos = d_table_data + in_offset;
for (int r = 0; r != h; ++r) {
blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
}
}
} else {
LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
}
}
};
} // namespace operators
} // namespace paddle
...@@ -21,7 +21,7 @@ DECLARE_uint64(conv_workspace_size_limit); ...@@ -21,7 +21,7 @@ DECLARE_uint64(conv_workspace_size_limit);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7100
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
...@@ -264,7 +264,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -264,7 +264,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7100
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion, REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion,
ops::CUDNNConvInceptionFusionOpKernel<float>, ops::CUDNNConvInceptionFusionOpKernel<float>,
......
...@@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { ...@@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("PreOut"), PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
"Output(PreOut) should not be null."); "Output(PreOut) should not be null.");
auto with_prefetch = ctx->Attrs().Get<bool>("remote_prefetch");
if (with_prefetch) {
PADDLE_ENFORCE(ctx->HasOutput("W_Out"),
"Output(W_Out) should not be null.");
}
const int64_t batch_size = ctx->GetInputDim("X")[0]; const int64_t batch_size = ctx->GetInputDim("X")[0];
std::vector<int64_t> output_shape({batch_size, 1}); std::vector<int64_t> output_shape({batch_size, 1});
ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
...@@ -95,7 +100,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -95,7 +100,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", AddInput("Label",
"(LoDTensor, required), The labels of training data. It's a" "(LoDTensor, required), The labels of training data. It's a"
"tensor with shape [N, 1]."); "tensor with shape [N, 1].");
AddInput("PTable", AddInput("PathTable",
"(LoDTensor, optional), The Path Table from root to current word" "(LoDTensor, optional), The Path Table from root to current word"
"it should have shape like [N, L], L is the length of the Path") "it should have shape like [N, L], L is the length of the Path")
.AsDispensable(); .AsDispensable();
...@@ -119,8 +124,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -119,8 +124,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
"[batch_size, code_length], where code_length represents the " "[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes.") "maximum path length from root to leaf nodes.")
.AsIntermediate(); .AsIntermediate();
AddOutput(
"W_Out",
"(LoDTensor, optinal) using input 'W' as Output to make it mutable"
"When we are using prefetch")
.AsIntermediate();
AddAttr<AttrType>("num_classes", "(int, optional), The number of classes") AddAttr<AttrType>("num_classes", "(int, optional), The number of classes")
.SetDefault(2); .SetDefault(2);
// for parameter prefetch
AddAttr<bool>("remote_prefetch", "").SetDefault(false);
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::vector<int>>("height_sections",
"Height for each output SelectedRows.")
.SetDefault(std::vector<int>({}));
AddAttr<std::vector<std::string>>(
"epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"table_names",
"(string vector, the splited table names that will be fetched from "
"parameter server)"
"in the order of input variables for mapping")
.SetDefault({});
AddComment(R"DOC( AddComment(R"DOC(
The hierarchical sigmoid operator organize the classes into a binary tree. The hierarchical sigmoid operator organize the classes into a binary tree.
At each node, a sigmoid function is used to calculate the probability of At each node, a sigmoid function is used to calculate the probability of
...@@ -189,24 +216,18 @@ class HierarchicalSigmoidGradOpGradVarTypeInference ...@@ -189,24 +216,18 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
<< " is set to SelectedRows"; << " is set to SelectedRows";
block->Var(w_grad_var_name) block->Var(w_grad_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS); ->SetType(framework::proto::VarType::SELECTED_ROWS);
if (hasBias) {
VLOG(30) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to SelectedRows";
block->Var(bias_grad_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
}
} else { } else {
VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor"; << " is set to LoDTensor";
block->Var(w_grad_var_name) block->Var(w_grad_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR); ->SetType(framework::proto::VarType::LOD_TENSOR);
}
if (hasBias) { if (hasBias) {
VLOG(30) << "hierarchical_sigmoid_grad op " VLOG(30) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to LoDTensor"; << framework::GradVarName("Bias") << " is set to LoDTensor";
block->Var(bias_grad_var_name) block->Var(bias_grad_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR); ->SetType(framework::proto::VarType::LOD_TENSOR);
} }
}
block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType());
} }
}; };
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <iostream> #include <iostream>
#include <iterator>
#include <set> #include <set>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -24,6 +26,10 @@ limitations under the License. */ ...@@ -24,6 +26,10 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -34,8 +40,9 @@ using platform::Transform; ...@@ -34,8 +40,9 @@ using platform::Transform;
static std::vector<int64_t> PathToRows(const framework::LoDTensor& path) { static std::vector<int64_t> PathToRows(const framework::LoDTensor& path) {
std::set<int64_t> rows; std::set<int64_t> rows;
const int64_t* paths = path.data<int64_t>();
for (int64_t i = 0; i < path.numel(); ++i) { for (int64_t i = 0; i < path.numel(); ++i) {
int64_t row = path.data<int64_t>()[i]; int64_t row = paths[i];
if (row < 0) { if (row < 0) {
continue; continue;
} }
...@@ -49,13 +56,54 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> { ...@@ -49,13 +56,54 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X")); auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W")); auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
auto* path = ctx.Input<framework::LoDTensor>("PTable"); auto* path = ctx.Input<framework::LoDTensor>("PathTable");
auto* code = ctx.Input<framework::LoDTensor>("PathCode"); auto* code = ctx.Input<framework::LoDTensor>("PathCode");
auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label")); auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
auto* bias = ctx.Input<framework::LoDTensor>("Bias"); auto* bias = ctx.Input<framework::LoDTensor>("Bias");
auto* out = ctx.Output<framework::LoDTensor>("Out"); auto* out = ctx.Output<framework::LoDTensor>("Out");
auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut"); auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes")); size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
// for remote prefetch
auto epmap = ctx.Attr<std::vector<std::string>>("epmap");
if (!epmap.empty()) {
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
// server
auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
auto table_names = ctx.Attr<std::vector<std::string>>("table_names");
std::vector<int64_t> real_rows = PathToRows(*path);
framework::Scope& local_scope = ctx.scope().NewScope();
auto* ids = local_scope.Var("Ids@Prefetch");
auto* x_tensor = ids->GetMutable<framework::LoDTensor>();
x_tensor->mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(real_rows.size()), 1}),
ctx.GetPlace());
// copy.
std::memcpy(x_tensor->data<int64_t>(), real_rows.data(),
real_rows.size() * sizeof(int64_t));
framework::DDim w_dims = ctx.Input<Tensor>("W")->dims();
w_dims[0] = x_tensor->dims()[0];
auto* w_tensor =
local_scope.Var("W@Prefetch")->GetMutable<framework::LoDTensor>();
w_tensor->Resize(w_dims);
#ifdef PADDLE_WITH_DISTRIBUTE
// w_Out is set to used by prefetch, never change it in other cases
auto* w_out = ctx.Output<framework::LoDTensor>("W_Out");
operators::distributed::prefetch_with_reconstruct<T>(
"Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections,
ctx, local_scope, w_out);
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!");
#endif
}
bool is_custom = false; bool is_custom = false;
if (path) { if (path) {
is_custom = true; is_custom = true;
...@@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X")); auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W")); auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
auto* path = ctx.Input<framework::LoDTensor>("PTable"); auto* path = ctx.Input<framework::LoDTensor>("PathTable");
auto* code = ctx.Input<framework::LoDTensor>("PathCode"); auto* code = ctx.Input<framework::LoDTensor>("PathCode");
auto* bias = ctx.Input<framework::LoDTensor>("Bias");
auto* in_grad = auto* in_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("X")); ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
bool is_sparse = ctx.Attr<bool>("is_sparse"); bool is_sparse = ctx.Attr<bool>("is_sparse");
...@@ -173,8 +220,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -173,8 +220,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
} }
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward. // be consistent with the clipping in forward.
if (!is_sparse) {
auto* bias_grad = auto* bias_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias")); ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
if (bias_grad) { if (bias_grad) {
...@@ -182,6 +227,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -182,6 +227,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
zero(dev_ctx, bias_grad, static_cast<T>(0.0)); zero(dev_ctx, bias_grad, static_cast<T>(0.0));
bit_code->AddGrad(pre_out_grad, bias_grad); bit_code->AddGrad(pre_out_grad, bias_grad);
} }
if (!is_sparse) {
auto* w_grad = auto* w_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("W")); ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
w_grad->mutable_data<T>(ctx.GetPlace()); w_grad->mutable_data<T>(ctx.GetPlace());
...@@ -200,21 +246,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -200,21 +246,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace()); w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
zero(dev_ctx, w_grad_value, static_cast<T>(0.0)); zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
auto* bias_grad =
ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
if (bias_grad) {
bias_grad->set_rows(real_rows);
// build ids -> rows index map
bias_grad->SyncIndex();
bias_grad->set_height(bias->dims()[0]);
auto* bias_grad_value = bias_grad->mutable_value();
std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
bias->dims()[1]};
bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
ctx.GetPlace());
zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
bit_code->AddGrad(pre_out_grad, bias_grad);
}
bit_code->MulGradWeight(pre_out_grad, w_grad, in); bit_code->MulGradWeight(pre_out_grad, w_grad, in);
} }
bit_code->MulGradError(pre_out_grad, w, in_grad); bit_code->MulGradError(pre_out_grad, w, in_grad);
......
...@@ -105,14 +105,16 @@ class HuberLossGradKernel : public framework::OpKernel<T> { ...@@ -105,14 +105,16 @@ class HuberLossGradKernel : public framework::OpKernel<T> {
out0->mutable_data<T>(context.GetPlace()); out0->mutable_data<T>(context.GetPlace());
auto x_grad = EigenVector<T>::Flatten(*out0); auto x_grad = EigenVector<T>::Flatten(*out0);
x_grad.device(place) = x_grad.device(place) =
out_grad * residual.unaryExpr(HuberLossBackward<T>(delta, -1.0)); residual.unaryExpr(HuberLossBackward<T>(delta, -1.0));
x_grad.device(place) = out_grad * x_grad;
} }
if (out1) { if (out1) {
out1->mutable_data<T>(context.GetPlace()); out1->mutable_data<T>(context.GetPlace());
auto y_grad = EigenVector<T>::Flatten(*out1); auto y_grad = EigenVector<T>::Flatten(*out1);
y_grad.device(place) = y_grad.device(place) =
out_grad * residual.unaryExpr(HuberLossBackward<T>(delta, 1.0)); residual.unaryExpr(HuberLossBackward<T>(delta, 1.0));
y_grad.device(place) = out_grad * y_grad;
} }
} }
}; };
......
...@@ -230,10 +230,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { ...@@ -230,10 +230,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
if (ctx->HasOutput(framework::GradVarName("Emission"))) { if (ctx->HasOutput(framework::GradVarName("Emission"))) {
ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims);
ctx->ShareLoD("Emission", framework::GradVarName("Emission"));
} }
if (ctx->HasOutput(framework::GradVarName("Transition"))) { if (ctx->HasOutput(framework::GradVarName("Transition"))) {
ctx->SetOutputDim(framework::GradVarName("Transition"), ctx->SetOutputDim(framework::GradVarName("Transition"),
transition_exps_dims); transition_exps_dims);
ctx->ShareLoD("Transition", framework::GradVarName("Transition"));
} }
} }
......
...@@ -92,7 +92,8 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> { ...@@ -92,7 +92,8 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
// server // server
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::prefetch(id_name, out_name, table_names, epmap, operators::distributed::prefetch(id_name, out_name, table_names, epmap,
height_sections, context); height_sections, context,
context.scope());
#else #else
PADDLE_THROW( PADDLE_THROW(
"paddle is not compiled with distribute support, can not do " "paddle is not compiled with distribute support, can not do "
......
...@@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
// server // server
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::prefetch(id_name, out_name, table_names, epmap, operators::distributed::prefetch(id_name, out_name, table_names, epmap,
height_sections, context); height_sections, context,
context.scope());
#else #else
PADDLE_THROW( PADDLE_THROW(
"paddle is not compiled with distribute support, can not do " "paddle is not compiled with distribute support, can not do "
......
...@@ -62,19 +62,27 @@ struct CUBlas<float> { ...@@ -62,19 +62,27 @@ struct CUBlas<float> {
cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Atype, int lda, const void *B,
cudaDataType_t Btype, int ldb, const float *beta, void *C, cudaDataType_t Btype, int ldb, const float *beta, void *C,
cudaDataType_t Ctype, int ldc) { cudaDataType_t Ctype, int ldc) {
// Because the gcc 4.8 doesn't expand template parameter pack that // Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack // appears in a lambda-expression, I can not use template parameter pack
// here. // here.
auto cublas_call = [&]() {
#if CUDA_VERSION >= 8000 #if CUDA_VERSION >= 8000
VLOG(5) << "use_tensor_op_math: " VLOG(5) << "use_tensor_op_math: "
<< (dev_ctx->tensor_core_available() ? "True" : "False"); << (platform::TensorCoreAvailable() ? "True" : "False");
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( PADDLE_ENFORCE(platform::dynload::cublasSgemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype,
beta, C, Ctype, ldc)); lda, B, Btype, ldb, beta, C, Ctype, ldc));
});
#else #else
PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be hold by multi-thread.
dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
#else
cublas_call();
#endif #endif
} }
}; };
...@@ -162,10 +170,11 @@ struct CUBlas<platform::float16> { ...@@ -162,10 +170,11 @@ struct CUBlas<platform::float16> {
cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Btype, int ldb, const void *beta, void *C,
cudaDataType_t Ctype, int ldc, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) { cudaDataType_t computeType) {
auto cublas_call = [&]() {
#if CUDA_VERSION >= 8000 #if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available(); bool use_tensor_op_math = platform::TensorCoreAvailable();
if (use_tensor_op_math) { if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP; algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
} }
...@@ -173,13 +182,20 @@ struct CUBlas<platform::float16> { ...@@ -173,13 +182,20 @@ struct CUBlas<platform::float16> {
<< (use_tensor_op_math ? "True" : "False"); << (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000 #endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE(platform::dynload::cublasGemmEx( PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype,
beta, C, Ctype, ldc, computeType, algo)); lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo));
});
#else #else
PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be hold by multi-thread.
dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
#else
cublas_call();
#endif #endif
} }
}; };
...@@ -207,10 +223,9 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA, ...@@ -207,10 +223,9 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CUDA_R_32F, N); CUDA_R_32F, N);
} else { } else {
#endif // CUDA_VERSION >= 8000 #endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
lda, &beta, C, N); &alpha, B, ldb, A, lda, &beta, C, N);
});
#if CUDA_VERSION >= 8000 #if CUDA_VERSION >= 8000
} }
...@@ -251,12 +266,9 @@ inline void Blas<platform::CUDADeviceContext>::GEMM( ...@@ -251,12 +266,9 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F);
#else #else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
context_.CublasCall([&](cublasHandle_t handle) { N, M, K, &h_alpha, h_B, ldb, h_A, lda,
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &h_beta, h_C, N);
&h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C,
N);
});
#endif // CUDA_VERSION >= 8000 #endif // CUDA_VERSION >= 8000
} }
...@@ -280,10 +292,8 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M, ...@@ -280,10 +292,8 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
} else { } else {
#endif // CUDA_VERSION >= 8000 #endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) { CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, &alpha, B, ldb, A, lda, &beta, C, ldc);
lda, &beta, C, ldc);
});
#if CUDA_VERSION >= 8000 #if CUDA_VERSION >= 8000
} }
...@@ -301,19 +311,16 @@ inline void Blas<platform::CUDADeviceContext>::GEMM( ...@@ -301,19 +311,16 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) { CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, N, M, K, &alpha, B, ldb, A, lda, &beta, C,
B, ldb, A, lda, &beta, C, ldc); ldc);
});
} }
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x, void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
T *y) const { T *y) const {
context_.CublasCall([&](cublasHandle_t handle) { CUBlas<T>::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1);
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
} }
template <> template <>
...@@ -323,9 +330,8 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N, ...@@ -323,9 +330,8 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
T beta, T *C) const { T beta, T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) { CUBlas<T>::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1,
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); &beta, C, 1);
});
} }
template <> template <>
...@@ -347,28 +353,28 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -347,28 +353,28 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
#if CUDA_VERSION >= 9010 #if CUDA_VERSION >= 9010
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) { if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto cublas_call = [&]() {
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available(); bool use_tensor_op_math = platform::TensorCoreAvailable();
if (use_tensor_op_math) { if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP; algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
} }
VLOG(5) << "use_tensor_op_math: " VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False"); << (use_tensor_op_math ? "True" : "False");
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx(
handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B,
strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C,
strideC, batchCount, CUDA_R_32F, algo)); CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo));
}); };
auto &dev_ctx = const_cast<platform::CUDADeviceContext &>(context_);
dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
} else { } else {
#endif // CUDA_VERSION >= 9010 #endif // CUDA_VERSION >= 9010
context_.CublasCall([&](cublasHandle_t handle) { CUBlas<T>::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA,
CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, N, M, K, &alpha, B, ldb, strideB, A, lda,
B, ldb, strideB, A, lda, strideA, &beta, C, strideA, &beta, C, ldc, strideC, batchCount);
ldc, strideC, batchCount);
});
#if CUDA_VERSION >= 9010 #if CUDA_VERSION >= 9010
} }
......
...@@ -84,41 +84,6 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat, ...@@ -84,41 +84,6 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
code_table_.apply_visitor(func); code_table_.apply_visitor(func);
} }
template <typename T>
struct MatrixBitCodeFunctorSelectedRowsAddGrad
: public boost::static_visitor<void> {
const framework::Tensor &tmat_;
framework::SelectedRows *vec_;
MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat,
framework::SelectedRows *vec)
: tmat_(tmat), vec_(vec) {}
template <typename CodeTable>
void operator()(const CodeTable &code_table) {
size_t batch_size = tmat_.dims()[0];
size_t width = tmat_.dims()[1];
auto *vec_data = vec_->mutable_value()->template data<T>();
auto *tmat_data = tmat_.data<T>();
for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table.get_code(i);
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
int64_t row_index = vec_->GetIndexFromId(static_cast<int64_t>(index));
vec_data[row_index] += tmat_data[i * width + j];
}
}
}
};
template <typename T>
void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
framework::SelectedRows *vec) {
MatrixBitCodeFunctorSelectedRowsAddGrad<T> func(tmat, vec);
code_table_.apply_visitor(func);
}
template <typename T> template <typename T>
struct MatrixBitCodeFunctorSum : public boost::static_visitor<void> { struct MatrixBitCodeFunctorSum : public boost::static_visitor<void> {
const framework::Tensor &tmat_; const framework::Tensor &tmat_;
......
...@@ -124,11 +124,12 @@ class SimpleCode { ...@@ -124,11 +124,12 @@ class SimpleCode {
template <typename T> template <typename T>
class CustomCode { class CustomCode {
public: public:
CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, CustomCode(const framework::Tensor& path_table,
const int64_t* ids, int index) { const framework::Tensor& path_code, const int64_t* ids,
seq_len_ = ptable.dims()[1]; int index) {
ptable_data_ = ptable.data<T>() + seq_len_ * index; seq_len_ = path_table.dims()[1];
pcode_data_ = pcode.data<T>() + seq_len_ * index; path_table_data_ = path_table.data<T>() + seq_len_ * index;
path_code_data_ = path_code.data<T>() + seq_len_ * index;
} }
/** /**
* Here the id of root should be 1 rather than 0, thus the encoding of class c * Here the id of root should be 1 rather than 0, thus the encoding of class c
...@@ -139,25 +140,25 @@ class CustomCode { ...@@ -139,25 +140,25 @@ class CustomCode {
* Binary classification path is the suffixes of encoding, thus leave out the * Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit. * left most bit in calc_bit.
*/ */
size_t calc_index(int bit) const { return ptable_data_[bit]; } size_t calc_index(int bit) const { return path_table_data_[bit]; }
bool calc_bit(int bit) const { return pcode_data_[bit]; } bool calc_bit(int bit) const { return path_code_data_[bit]; }
// NOTE: this function is not thread-safe. // NOTE: this function is not thread-safe.
int get_length() const { int get_length() const {
if (length_ < 0) { if (length_ < 0) {
auto len = seq_len_; auto len = seq_len_;
length_ = length_ = static_cast<int>(
static_cast<int>(std::find_if(ptable_data_, ptable_data_ + len, std::find_if(path_table_data_, path_table_data_ + len,
[](const T& val) { return val < 0; }) - [](const T& val) { return val < 0; }) -
ptable_data_); path_table_data_);
} }
return length_; return length_;
} }
private: private:
int64_t seq_len_; int64_t seq_len_;
const T* ptable_data_; const T* path_table_data_;
const T* pcode_data_; const T* path_code_data_;
mutable int length_{-1}; mutable int length_{-1};
}; };
...@@ -181,9 +182,9 @@ class SimpleCodeTable { ...@@ -181,9 +182,9 @@ class SimpleCodeTable {
template <typename T> template <typename T>
class CustomCodeTable { class CustomCodeTable {
public: public:
CustomCodeTable(const framework::Tensor& ptable, CustomCodeTable(const framework::Tensor& path_table,
const framework::Tensor& pcode, const int64_t* ids) const framework::Tensor& path_code, const int64_t* ids)
: ptable_(ptable), pcode_(pcode), ids_(ids) {} : ptable_(path_table), pcode_(path_code), ids_(ids) {}
CustomCode<T> get_code(int64_t code) const { CustomCode<T> get_code(int64_t code) const {
return CustomCode<T>(ptable_, pcode_, ids_, code); return CustomCode<T>(ptable_, pcode_, ids_, code);
...@@ -210,11 +211,11 @@ class MatrixBitCodeFunctor { ...@@ -210,11 +211,11 @@ class MatrixBitCodeFunctor {
ids_(ids), ids_(ids),
code_table_(SimpleCodeTable(num_classes, ids)) {} code_table_(SimpleCodeTable(num_classes, ids)) {}
MatrixBitCodeFunctor(const framework::Tensor& ptable, MatrixBitCodeFunctor(const framework::Tensor& path_table,
const framework::Tensor& pcode, const int64_t* ids) const framework::Tensor& path_code, const int64_t* ids)
: num_classes_(static_cast<size_t>(ptable.dims()[1])), : num_classes_(static_cast<size_t>(path_table.dims()[1])),
ids_(ids), ids_(ids),
code_table_(CustomCodeTable<int64_t>(ptable, pcode, ids)) {} code_table_(CustomCodeTable<int64_t>(path_table, path_code, ids)) {}
/* For j < code_length /* For j < code_length
tmat(i, j) += vec(0, index(i, j)) tmat(i, j) += vec(0, index(i, j))
*/ */
...@@ -225,11 +226,6 @@ class MatrixBitCodeFunctor { ...@@ -225,11 +226,6 @@ class MatrixBitCodeFunctor {
*/ */
void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec);
/* For selected rows For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec);
/* For j < code_length /* For j < code_length
sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
*/ */
......
...@@ -153,6 +153,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -153,6 +153,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("is_sparse", "(boolean, default false) Sparse update.") AddAttr<bool>("is_sparse", "(boolean, default false) Sparse update.")
.SetDefault(false); .SetDefault(false);
// for parameter prefetch
AddAttr<bool>("remote_prefetch", "").SetDefault(false);
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::vector<int>>("height_sections",
"Height for each output SelectedRows.")
.SetDefault(std::vector<int>({}));
AddAttr<std::vector<std::string>>(
"epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"table_names",
"(string vector, the splited table names that will be fetched from "
"parameter server)"
"in the order of input variables for mapping")
.SetDefault({});
AddAttr<std::vector<int>>("custom_neg_classes", AddAttr<std::vector<int>>("custom_neg_classes",
"This attribute only be used in unitest. Classes " "This attribute only be used in unitest. Classes "
"in this list wiil be used as negative classes " "in this list wiil be used as negative classes "
...@@ -222,24 +240,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference { ...@@ -222,24 +240,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference {
void operator()(const framework::OpDesc &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override { framework::BlockDesc *block) const override {
auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front();
auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front();
auto attr = op_desc.GetAttr("is_sparse"); auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr); bool is_sparse = boost::get<bool>(attr);
if (is_sparse) { if (is_sparse) {
VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad VLOG(3) << "nce_op_grad op " << weight_grad << " and "
<< " is set to SelectedRows"; << " is set to SelectedRows";
block->Var(weight_grad) block->Var(weight_grad)
->SetType(framework::proto::VarType::SELECTED_ROWS); ->SetType(framework::proto::VarType::SELECTED_ROWS);
block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS);
} else { } else {
VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad VLOG(3) << "nce_op_grad op " << weight_grad << " and "
<< " is set to LoDTensor"; << " is set to LoDTensor";
block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
} }
block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType());
block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType());
} }
}; };
......
...@@ -15,8 +15,10 @@ limitations under the License. */ ...@@ -15,8 +15,10 @@ limitations under the License. */
#pragma once #pragma once
#include <math.h> #include <math.h>
#include <iterator>
#include <random> #include <random>
#include <set> #include <set>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -24,6 +26,10 @@ limitations under the License. */ ...@@ -24,6 +26,10 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/operators/math/sampler.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -43,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context, ...@@ -43,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context,
auto label = context.Input<Tensor>("Label"); auto label = context.Input<Tensor>("Label");
const int64_t *label_data = label->data<int64_t>(); const int64_t *label_data = label->data<int64_t>();
auto label_dims = label->dims(); auto label_dims = label->dims();
// int num_total_classes = context.Attr<int>("num_total_classes");
// for unitest // for unitest
std::vector<int> custom_neg_classes = std::vector<int> custom_neg_classes =
context.Attr<std::vector<int>>("custom_neg_classes"); context.Attr<std::vector<int>>("custom_neg_classes");
...@@ -144,7 +149,72 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -144,7 +149,72 @@ class NCEKernel : public framework::OpKernel<T> {
} }
// forward mul // forward mul
auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input"))); auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
// for remote prefetch
auto epmap = context.Attr<std::vector<std::string>>("epmap");
if (!epmap.empty()) {
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
// server
std::vector<int64_t> labels;
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
labels.push_back(sample_labels_data[i]);
}
std::set<T> st(labels.begin(), labels.end());
labels.assign(st.begin(), st.end());
framework::Scope &local_scope = context.scope().NewScope();
auto height_sections = context.Attr<std::vector<int>>("height_sections");
auto table_names = context.Attr<std::vector<std::string>>("table_names");
auto *ids = local_scope.Var("Ids@Prefetch");
auto *x_tensor = ids->GetMutable<framework::LoDTensor>();
x_tensor->mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(labels.size()), 1}),
context.GetPlace());
// copy.
std::memcpy(x_tensor->data<int64_t>(), labels.data(),
labels.size() * sizeof(int64_t));
std::vector<int> w_dims = paddle::framework::vectorize2int(
context.Input<Tensor>("Weight")->dims());
w_dims[0] = static_cast<int>(labels.size());
auto *w_tensor = local_scope.Var("Weight@Prefetch")
->GetMutable<framework::LoDTensor>();
w_tensor->Resize(framework::make_ddim(w_dims));
#ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch",
table_names, epmap, height_sections,
context, local_scope);
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!");
#endif
auto weight_mat = EigenMatrix<T>::From(
(local_scope.Var("Weight@Prefetch")->Get<framework::LoDTensor>()));
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
std::vector<int64_t>::iterator it =
std::find(labels.begin(), labels.end(), sample_labels_data[i]);
int idx = std::distance(labels.begin(), it);
Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
(input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
weight_mat.chip(idx, 0))
.sum();
sample_out_data[i] += result(0);
sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
}
context.scope().DeleteScope(&local_scope);
} else {
auto weight_mat =
EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
for (int64_t i = 0; i < sample_labels->numel(); ++i) { for (int64_t i = 0; i < sample_labels->numel(); ++i) {
Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result = Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
(input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) * (input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
...@@ -153,6 +223,8 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -153,6 +223,8 @@ class NCEKernel : public framework::OpKernel<T> {
sample_out_data[i] += result(0); sample_out_data[i] += result(0);
sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
} }
}
// forward cost // forward cost
for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {
out_data[i] = 0; out_data[i] = 0;
...@@ -240,9 +312,6 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -240,9 +312,6 @@ class NCEGradKernel : public framework::OpKernel<T> {
sample_grad_data[i] *= d_out_data[sample_idx]; sample_grad_data[i] *= d_out_data[sample_idx];
} }
bool is_sparse = context.Attr<bool>("is_sparse");
if (!is_sparse) {
// get d_bias // get d_bias
auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias")); auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
if (d_bias != nullptr) { if (d_bias != nullptr) {
...@@ -252,6 +321,10 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -252,6 +321,10 @@ class NCEGradKernel : public framework::OpKernel<T> {
d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
} }
} }
bool is_sparse = context.Attr<bool>("is_sparse");
if (!is_sparse) {
// get d_w // get d_w
auto d_w = context.Output<Tensor>(framework::GradVarName("Weight")); auto d_w = context.Output<Tensor>(framework::GradVarName("Weight"));
if (d_w != nullptr) { if (d_w != nullptr) {
...@@ -273,34 +346,6 @@ class NCEGradKernel : public framework::OpKernel<T> { ...@@ -273,34 +346,6 @@ class NCEGradKernel : public framework::OpKernel<T> {
std::set<T> st(labels.begin(), labels.end()); std::set<T> st(labels.begin(), labels.end());
labels.assign(st.begin(), st.end()); labels.assign(st.begin(), st.end());
auto *bias_var = context.InputVar("Bias");
DDim bias_dim;
if (bias_var->IsType<LoDTensor>()) {
bias_dim = context.Input<LoDTensor>("Bias")->dims();
} else if (bias_var->IsType<SelectedRows>()) {
auto *table_t = context.Input<SelectedRows>("Bias");
bias_dim = table_t->value().dims();
} else {
PADDLE_THROW(
"The parameter Bias of a NCE_OP "
"must be either LoDTensor or SelectedRows");
}
auto d_bias =
context.Output<SelectedRows>(framework::GradVarName("Bias"));
d_bias->set_rows(labels);
d_bias->set_height(bias_dim[0]);
d_bias->mutable_value()->Resize(
{static_cast<int64_t>(labels.size()), bias_dim[1]});
T *d_bias_data =
d_bias->mutable_value()->mutable_data<T>(context.GetPlace());
std::fill(d_bias_data, d_bias_data + labels.size(), 0.0);
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
d_bias_data[d_bias->Index(sample_labels_data[i])] +=
sample_grad_data[i];
}
auto *table_var = context.InputVar("Weight"); auto *table_var = context.InputVar("Weight");
DDim table_dim; DDim table_dim;
if (table_var->IsType<LoDTensor>()) { if (table_var->IsType<LoDTensor>()) {
......
...@@ -23,5 +23,7 @@ limitations under the License. */ ...@@ -23,5 +23,7 @@ limitations under the License. */
#include "ops/binary_unnary_op.h" #include "ops/binary_unnary_op.h"
#include "ops/fill_constant_op.h" #include "ops/fill_constant_op.h"
#include "ops/mean_op.h"
#include "ops/mul_op.h" #include "ops/mul_op.h"
#include "ops/scale_op.h"
#include "ops/top_k_op.h" #include "ops/top_k_op.h"
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
float scale, std::shared_ptr<ngraph::Node> node) {
auto node_shape = node->get_shape();
auto scale_const = ngraph::op::Constant::create(node->get_element_type(),
node_shape, {scale});
return std::make_shared<T>(scale_const, node);
}
template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
std::shared_ptr<ngraph::Node> scale_1d,
std::shared_ptr<ngraph::Node> node) {
auto scale_shape = scale_1d->get_shape();
PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node");
PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}");
auto node_shape = node->get_shape();
ngraph::AxisSet axis_set;
for (size_t i = 0; i < node_shape.size(); ++i) {
axis_set.insert(i);
}
node_shape.push_back(1);
auto scale_bcast =
std::make_shared<ngraph::op::Broadcast>(scale_1d, node_shape, axis_set);
auto scale_reshape =
paddle::platform::NgReshaper(scale_bcast, node->get_shape());
return std::make_shared<T>(scale_reshape, node);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <functional>
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildMeanNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
ngraph::AxisSet axes;
for (size_t i = 0; i < input->get_shape().size(); ++i) {
axes.insert(i);
}
auto mean = ngraph::builder::mean(input, axes);
auto mean_1d = std::make_shared<ngraph::op::Reshape>(
mean, ngraph::AxisVector{}, ngraph::Shape{1});
paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map);
}
void BuildMeanGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
auto x_shape = x->get_shape();
float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1,
std::multiplies<float>());
auto node_const = ngraph::op::Constant::create(og->get_element_type(),
ngraph::Shape{1}, {x_size});
auto node_div = std::make_shared<ngraph::op::Divide>(og, node_const);
auto result = ElementwiseScalar<ngraph::op::Add>(
og / node_const,
ngraph::op::Constant::create(og->get_element_type(), x_shape, {0}));
paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildScaleNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
float scale = op_attrs.Get<float>("scale");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto out = ElementwiseScalar<ngraph::op::Multiply>(scale, x);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
...@@ -424,16 +424,23 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -424,16 +424,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
} }
} }
framework::SelectedRows cpu_grad_merge;
const framework::SelectedRows* grad_merge_ptr; const framework::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) { if (is_strict_sorted) {
grad_merge_ptr = &grad; grad_merge_ptr = &grad;
} else { } else {
// merge duplicated rows if any. // merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor // The rows of grad_merge have been sorted inside MergeAdd functor
framework::SelectedRows* grad_merge_var;
scatter::MergeAdd<DeviceContext, T> merge_func; scatter::MergeAdd<DeviceContext, T> merge_func;
auto* grad_merge_var = const_cast<framework::Scope&>(ctx.scope()) if (platform::is_cpu_place(ctx.GetPlace())) {
grad_merge_var = &cpu_grad_merge;
} else {
// FIXME(qiao): GPU also need to fix this
grad_merge_var = const_cast<framework::Scope&>(ctx.scope())
.Var() .Var()
->GetMutable<framework::SelectedRows>(); ->GetMutable<framework::SelectedRows>();
}
merge_func(ctx.template device_context<DeviceContext>(), grad, merge_func(ctx.template device_context<DeviceContext>(), grad,
grad_merge_var, true); grad_merge_var, true);
grad_merge_ptr = grad_merge_var; grad_merge_ptr = grad_merge_var;
......
...@@ -49,7 +49,7 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status, ...@@ -49,7 +49,7 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
class CTRReader : public framework::FileReader { class CTRReader : public framework::FileReader {
public: public:
explicit CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue, explicit CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
int batch_size, int thread_num, int batch_size, size_t thread_num,
const std::vector<std::string>& slots, const std::vector<std::string>& slots,
const std::vector<std::string>& file_list) const std::vector<std::string>& file_list)
: batch_size_(batch_size), slots_(slots), file_list_(file_list) { : batch_size_(batch_size), slots_(slots), file_list_(file_list) {
......
...@@ -100,7 +100,7 @@ ENDIF() ...@@ -100,7 +100,7 @@ ENDIF()
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
if(WITH_GPU) if(WITH_GPU)
nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator)
else() else()
cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator)
endif() endif()
...@@ -245,15 +245,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -245,15 +245,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place); eigen_stream_->Reinitialize(&stream_, place);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH)); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
if (TensorCoreAvailable()) {
#if CUDA_VERSION >= 9000
cublas_tensor_core_handle_.reset(
new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH));
#endif
}
if (dynload::HasCUDNN()) { if (dynload::HasCUDNN()) {
cudnn_holder_.reset(new CudnnHolder(&stream_, place)); cudnn_holder_.reset(new CudnnHolder(&stream_, place));
} }
...@@ -313,8 +306,7 @@ CUDADeviceContext::~CUDADeviceContext() { ...@@ -313,8 +306,7 @@ CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device); SetDeviceId(place_.device);
Wait(); Wait();
WaitStreamCallback(); WaitStreamCallback();
cublas_handle_.reset(); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
cublas_tensor_core_handle_.reset();
eigen_stream_.reset(); eigen_stream_.reset();
eigen_device_.reset(); eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_));
...@@ -343,8 +335,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { ...@@ -343,8 +335,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
bool CUDADeviceContext::tensor_core_available() const { cublasHandle_t CUDADeviceContext::cublas_handle() const {
return cublas_tensor_core_handle_ != nullptr; return cublas_handle_;
} }
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
......
...@@ -20,7 +20,6 @@ limitations under the License. */ ...@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/platform/temporary_allocator.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_helper.h"
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
...@@ -210,6 +209,39 @@ class CudnnWorkspaceHandle { ...@@ -210,6 +209,39 @@ class CudnnWorkspaceHandle {
std::unique_ptr<std::lock_guard<std::mutex>> guard_; std::unique_ptr<std::lock_guard<std::mutex>> guard_;
}; };
#if CUDA_VERSION >= 9000
class ScopedCublasMathMode {
public:
ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode)
: handle_(handle) {
need_reset = false;
PADDLE_ENFORCE(
platform::dynload::cublasGetMathMode(handle_, &old_math_mode_),
"Failed to get old cublas math mode");
if (old_math_mode_ != new_math_mode) {
PADDLE_ENFORCE(
platform::dynload::cublasSetMathMode(handle_, new_math_mode),
"Failed to set old cublas math mode");
need_reset = true;
}
}
~ScopedCublasMathMode() {
if (need_reset) {
PADDLE_ENFORCE(
platform::dynload::cublasSetMathMode(handle_, old_math_mode_),
"Failed to set old cublas math mode");
}
}
private:
cublasHandle_t handle_;
cublasMath_t old_math_mode_;
bool need_reset;
};
#endif
class CUDADeviceContext : public DeviceContext { class CUDADeviceContext : public DeviceContext {
public: public:
explicit CUDADeviceContext(CUDAPlace place); explicit CUDADeviceContext(CUDAPlace place);
...@@ -230,25 +262,8 @@ class CUDADeviceContext : public DeviceContext { ...@@ -230,25 +262,8 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return eigen device in the device context. */ /*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const; Eigen::GpuDevice* eigen_device() const;
/*! \brief Call cublas function safely. */ /*! \brief Return cublas handle in the device context. */
template <typename Callback> cublasHandle_t cublas_handle() const;
inline void CublasCall(Callback&& callback) const {
cublas_handle_->Call(std::forward<Callback>(callback));
}
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
}
}
/*! \brief Return cudnn handle in the device context. */ /*! \brief Return cudnn handle in the device context. */
cudnnHandle_t cudnn_handle() const; cudnnHandle_t cudnn_handle() const;
...@@ -267,6 +282,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -267,6 +282,7 @@ class CUDADeviceContext : public DeviceContext {
template <typename Callback> template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) { void RecordEvent(cudaEvent_t ev, Callback callback) {
std::lock_guard<std::mutex> guard(mtx_);
callback(); callback();
PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
} }
...@@ -278,6 +294,18 @@ class CUDADeviceContext : public DeviceContext { ...@@ -278,6 +294,18 @@ class CUDADeviceContext : public DeviceContext {
void WaitStreamCallback() const { callback_manager_->Wait(); } void WaitStreamCallback() const { callback_manager_->Wait(); }
#if CUDA_VERSION >= 9000
/*! \brief CublasCall may need to change cublas's config,
* but the cublas may be hold by multi-thread, so we should
* add lock here. */
template <typename Callback>
void CublasCall(Callback callback, cublasMath_t new_math) {
std::lock_guard<std::mutex> guard(cublas_mtx_);
ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math);
callback();
}
#endif
private: private:
CUDAPlace place_; CUDAPlace place_;
...@@ -285,9 +313,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -285,9 +313,7 @@ class CUDADeviceContext : public DeviceContext {
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<CudnnHolder> cudnn_holder_; std::unique_ptr<CudnnHolder> cudnn_holder_;
cudaStream_t stream_; cudaStream_t stream_;
cublasHandle_t cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
int compute_capability_; int compute_capability_;
int runtime_version_; int runtime_version_;
...@@ -295,10 +321,12 @@ class CUDADeviceContext : public DeviceContext { ...@@ -295,10 +321,12 @@ class CUDADeviceContext : public DeviceContext {
int multi_process_; int multi_process_;
int max_threads_per_mp_; int max_threads_per_mp_;
mutable std::mutex mtx_;
// StreamCallbackManager is thread-safe // StreamCallbackManager is thread-safe
std::unique_ptr<StreamCallbackManager> callback_manager_; std::unique_ptr<StreamCallbackManager> callback_manager_;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); mutable std::mutex cublas_mtx_;
}; };
template <> template <>
......
...@@ -43,6 +43,9 @@ TEST(Device, CUDADeviceContext) { ...@@ -43,6 +43,9 @@ TEST(Device, CUDADeviceContext) {
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
ASSERT_NE(nullptr, cudnn_handle); ASSERT_NE(nullptr, cudnn_handle);
cublasHandle_t cublas_handle = device_context->cublas_handle();
ASSERT_NE(nullptr, cublas_handle);
ASSERT_NE(nullptr, device_context->stream());
delete device_context; delete device_context;
} }
} }
......
...@@ -145,7 +145,8 @@ class MKLDNNHandler { ...@@ -145,7 +145,8 @@ class MKLDNNHandler {
const std::shared_ptr<mkldnn::memory> user_memory_p, const std::shared_ptr<mkldnn::memory> user_memory_p,
const std::string& suffix, const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline, // NOLINT std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) { bool is_persistent = false, bool is_INT8 = false,
std::vector<float> scale_data = {1.0f}, int mask = 0) {
// create reorder primitive if the input format is not the preferred one // create reorder primitive if the input format is not the preferred one
auto local_key = key_ + suffix; auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p"; auto key_reorder_p = key_ + suffix + "reorder_p";
...@@ -159,8 +160,20 @@ class MKLDNNHandler { ...@@ -159,8 +160,20 @@ class MKLDNNHandler {
std::shared_ptr<mkldnn::primitive> reorder_p; std::shared_ptr<mkldnn::primitive> reorder_p;
if (mpd != user_mpd) { if (mpd != user_mpd) {
target_memory_p = std::make_shared<mkldnn::memory>(mpd); target_memory_p = std::make_shared<mkldnn::memory>(mpd);
auto reorder_p = std::shared_ptr<mkldnn::reorder> reorder_p;
std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p); if (is_INT8) {
mkldnn::primitive_attr
attri; // attribute for int8 weights and bias data reorder.
attri.set_output_scales(mask, scale_data);
auto reorder_pd = std::shared_ptr<mkldnn::reorder::primitive_desc>(
new mkldnn::reorder::primitive_desc(user_mpd, mpd, attri));
reorder_p = std::shared_ptr<mkldnn::reorder>(new mkldnn::reorder(
*reorder_pd, *user_memory_p, *target_memory_p));
} else {
reorder_p = std::make_shared<mkldnn::reorder>(*user_memory_p,
*target_memory_p);
}
dev_ctx_.SetBlob(key_reorder_p, reorder_p); dev_ctx_.SetBlob(key_reorder_p, reorder_p);
pipeline.push_back(*reorder_p); pipeline.push_back(*reorder_p);
} }
...@@ -182,22 +195,58 @@ class MKLDNNHandler { ...@@ -182,22 +195,58 @@ class MKLDNNHandler {
return dims2str(operand_dims) + suffix; return dims2str(operand_dims) + suffix;
} }
template <typename M> template <typename T>
static void SetDstMemory( static void SetDstMemory(
const framework::ExecutionContext& ctx, framework::Tensor* output, const framework::ExecutionContext& ctx, framework::Tensor* output,
std::vector<int> dst_tz, const mkldnn::engine& engine, std::vector<int> dst_tz, const mkldnn::engine& engine,
std::shared_ptr<mkldnn::memory::primitive_desc>& dst_pd, // NOLINT std::shared_ptr<mkldnn::memory::primitive_desc>& dst_pd, // NOLINT
std::shared_ptr<mkldnn::memory>& dst_memory) { // NOLINT std::shared_ptr<mkldnn::memory>& dst_memory) { // NOLINT
M* output_data = output->mutable_data<M>(ctx.GetPlace()); T* output_data = output->mutable_data<T>(ctx.GetPlace());
auto dst_md = platform::MKLDNNMemDesc( auto dst_md = platform::MKLDNNMemDesc(
{dst_tz}, paddle::framework::ToMKLDNNDataType( {dst_tz}, paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<M>::DataType), framework::DataTypeTrait<T>::DataType),
mkldnn::memory::format::nhwc); mkldnn::memory::format::nhwc);
dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine)); dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine));
dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<M>(output_data))); dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<T>(output_data)));
}
static void AppendKey(
std::string* key, const mkldnn::memory::dims& input_dims,
const mkldnn::memory::dims& weights_dims, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& dilations,
const int& groups, const mkldnn::memory::data_type& srcdt,
const mkldnn::memory::format& format,
const mkldnn::memory::data_type& dstdt, const std::string& suffix) {
AppendKeyDims(key, input_dims);
AppendKeyDims(key, weights_dims);
AppendKeyVec(key, strides);
AppendKeyVec(key, paddings);
AppendKeyVec(key, dilations);
AppendKey(key, std::to_string(groups));
AppendKey(key, std::to_string(srcdt));
AppendKey(key, std::to_string(format));
AppendKey(key, std::to_string(dstdt));
AppendKey(key, suffix);
} }
protected: protected:
static void AppendKeyDims(std::string* key,
const mkldnn::memory::dims& dims) {
for (unsigned int i = 0; i < dims.size(); i++) {
AppendKey(key, std::to_string(dims[i]));
}
}
static void AppendKeyVec(std::string* key, const std::vector<int>& dims) {
for (unsigned int i = 0; i < dims.size(); i++) {
AppendKey(key, std::to_string(dims[i]));
}
}
static void AppendKey(std::string* key, const std::string& s) {
key->append(s);
}
static std::string dims2str(const mkldnn::memory::dims& operand_dims) { static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
std::string dstr = ""; std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) { for (size_t i = 0; i < operand_dims.size(); ++i) {
...@@ -215,7 +264,8 @@ class MKLDNNHandler { ...@@ -215,7 +264,8 @@ class MKLDNNHandler {
class TransposeMKLDNNHandler : public MKLDNNHandler { class TransposeMKLDNNHandler : public MKLDNNHandler {
public: public:
TransposeMKLDNNHandler(std::vector<int>& dims, std::vector<int>& axis, TransposeMKLDNNHandler(std::vector<int>& dims, // NOLINT
std::vector<int>& axis, // NOLINT
const platform::MKLDNNDeviceContext& dev_ctx, const platform::MKLDNNDeviceContext& dev_ctx,
mkldnn::engine engine, const std::string& base_key) mkldnn::engine engine, const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key), : platform::MKLDNNHandler(dev_ctx, engine, base_key),
...@@ -303,8 +353,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { ...@@ -303,8 +353,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
} }
protected: protected:
mkldnn_memory_desc_t Axis2MemoryDesc(std::vector<int>& nchw_tz, mkldnn_memory_desc_t Axis2MemoryDesc(std::vector<int>& nchw_tz, // NOLINT
std::vector<int>& axis) { std::vector<int>& axis // NOLINT
) {
mkldnn_memory_desc_t mem_fmt; mkldnn_memory_desc_t mem_fmt;
mem_fmt.primitive_kind = mkldnn_memory; mem_fmt.primitive_kind = mkldnn_memory;
...@@ -462,21 +513,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { ...@@ -462,21 +513,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive( std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p, const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline, // NOLINT std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) { bool is_persistent = false, bool is_INT8 = false,
std::vector<float> scale_data = {1.0f}, int mask = 0) {
auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(weights_pd, user_weights_pd, return this->AcquireMemory(
user_weights_memory_p, "@weights_mem_p", weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p",
pipeline, is_persistent); pipeline, is_persistent, is_INT8, scale_data, mask);
} }
std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive( std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_bias_memory_p, const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false, bool is_INT8 = false,
std::vector<float> scale_data = {1.0f},
int mask = 0) { // NOLINT
auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
auto bias_pd = conv_pd_->bias_primitive_desc(); auto bias_pd = conv_pd_->bias_primitive_desc();
return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
"@bias_mem_p", pipeline); "@bias_mem_p", pipeline, is_persistent, is_INT8,
scale_data, mask);
} }
std::shared_ptr<forward_t> AcquireConvolution( std::shared_ptr<forward_t> AcquireConvolution(
...@@ -594,5 +650,29 @@ using ConvTransposeMKLDNNHandler = ...@@ -594,5 +650,29 @@ using ConvTransposeMKLDNNHandler =
ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward, ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward,
mkldnn::deconvolution_backward_data, mkldnn::deconvolution_backward_data,
mkldnn::deconvolution_backward_weights>; mkldnn::deconvolution_backward_weights>;
template <typename T>
static std::shared_ptr<mkldnn::memory> SetDstMemory(
const framework::ExecutionContext& ctx, framework::Tensor* output,
const std::shared_ptr<ConvMKLDNNHandler>& handler) {
T* output_data = output->mutable_data<T>(
ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
handler->GetDstMemorySize());
std::shared_ptr<mkldnn::memory> dst_memory_p =
handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
return dst_memory_p;
}
template <typename T>
static std::shared_ptr<mkldnn::memory> SetDstMemoryHandler(
const framework::ExecutionContext& ctx, framework::Tensor* output,
const std::shared_ptr<ConvMKLDNNHandler>& handler) {
T* output_data = output->mutable_data<T>(
ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
handler->GetDstMemorySize());
std::shared_ptr<mkldnn::memory> dst_memory_p;
dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
return dst_memory_p;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -106,7 +106,7 @@ struct NCCLContextMap { ...@@ -106,7 +106,7 @@ struct NCCLContextMap {
} }
std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]); std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
// if num_trainers == 1, should create a new nccl id for local comms. // if num_trainers == 1, should create a new nccl id for local comms.
if (num_trainers == 1) { if (num_trainers == 1 && nccl_id == nullptr) {
std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex()); std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
comms.get(), static_cast<int>(order_.size()), order_.data())); comms.get(), static_cast<int>(order_.size()), order_.data()));
......
...@@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/port.h"
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -25,9 +22,12 @@ limitations under the License. */ ...@@ -25,9 +22,12 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
...@@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { ...@@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> l(profiler_mu);
is_enabled_ = true; is_enabled_ = true;
dev_ctx_ = dev_ctx; dev_ctx_ = dev_ctx;
name_ = name; name_ = name;
...@@ -184,8 +185,8 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) ...@@ -184,8 +185,8 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
} }
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
std::lock_guard<std::mutex> l(profiler_mu);
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
......
...@@ -14,12 +14,27 @@ ...@@ -14,12 +14,27 @@
#include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/platform/temporary_allocator.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
DECLARE_double(limit_of_temporary_allocation); DECLARE_double(limit_of_temporary_allocation);
namespace paddle { namespace paddle {
namespace platform { namespace platform {
class DummyOp : public framework::OperatorBase {
public:
DummyOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
protected:
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {}
};
TEST(temporary_allocator, temporary_allocator) { TEST(temporary_allocator, temporary_allocator) {
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
TemporaryAllocator alloc(cpu_place); TemporaryAllocator alloc(cpu_place);
...@@ -68,96 +83,92 @@ TEST(temporary_allocator, add_callback) { ...@@ -68,96 +83,92 @@ TEST(temporary_allocator, add_callback) {
} }
TEST(temporary_allocator, create_tensor_with_allocationptr) { TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform::CPUPlace cpu_place; framework::VariableNameMap dummy_vars;
TemporaryAllocator cpu_alloc(cpu_place); framework::AttributeMap dummy_attrs;
DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs);
framework::Scope scope;
framework::VariableValueMap vars;
framework::RuntimeContext run_ctx(vars, vars);
size_t memory_size = 300;
{ {
size_t memory_size = 200; platform::CPUPlace cpu_place;
auto allocation = cpu_alloc.Allocate(memory_size); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
void* address = allocation->ptr(); auto* dev_ctx =
static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
int numel = memory_size / sizeof(float); int numel = memory_size / sizeof(float);
framework::Tensor tensor = framework::GetTensor<float>( framework::Tensor tensor =
std::move(allocation), framework::make_ddim({numel})); ctx.AllocateTmpTensor<float, platform::CPUDeviceContext>(
PADDLE_ENFORCE_EQ(address, tensor.data<float>()); framework::make_ddim({numel}), *dev_ctx);
PADDLE_ENFORCE_EQ(tensor.numel(), numel); PADDLE_ENFORCE_EQ(tensor.numel(), numel);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place);
{ {
size_t memory_size = 300; platform::CUDAPlace gpu_place(0);
auto allocation = gpu_alloc.Allocate(memory_size); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
void* address = allocation->ptr(); auto* dev_ctx =
static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
int numel = memory_size / sizeof(float); int numel = memory_size / sizeof(float);
framework::Tensor tensor = framework::GetTensor<float>( framework::Tensor tensor =
std::move(allocation), framework::make_ddim({numel})); ctx.AllocateTmpTensor<float, platform::CUDADeviceContext>(
PADDLE_ENFORCE_EQ(address, tensor.data<float>()); framework::make_ddim({numel}), *dev_ctx);
PADDLE_ENFORCE_EQ(tensor.numel(), numel); PADDLE_ENFORCE_EQ(tensor.numel(), numel);
} }
// The allocation is not holded now, it should be placed to
// TemporaryAllocationQueue.
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
gpu_alloc.Release([]() {});
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
#endif #endif
} }
TEST(temporary_allocator, create_tensor_with_allocationptr2) { TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform::CPUPlace cpu_place; framework::VariableNameMap dummy_vars;
TemporaryAllocator cpu_alloc(cpu_place); framework::AttributeMap dummy_attrs;
{ DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs);
framework::Scope scope;
framework::VariableValueMap vars;
framework::RuntimeContext run_ctx(vars, vars);
size_t memory_size = 400; size_t memory_size = 400;
{
platform::CPUPlace cpu_place;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx =
static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
int numel = memory_size / sizeof(float); int numel = memory_size / sizeof(float);
framework::Tensor out_side_tensor; framework::Tensor out_side_tensor;
void* address;
{ {
auto allocation = cpu_alloc.Allocate(memory_size); framework::Tensor tensor =
address = allocation->ptr(); ctx.AllocateTmpTensor<float, platform::CPUDeviceContext>(
framework::Tensor tensor = framework::GetTensor<float>( framework::make_ddim({numel}), *dev_ctx);
std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel); PADDLE_ENFORCE_EQ(tensor.numel(), numel);
out_side_tensor.ShareDataWith(tensor); out_side_tensor.ShareDataWith(tensor);
} }
PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place);
{ {
void* address; platform::CUDAPlace gpu_place(0);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx =
static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
size_t memory_size = 500; size_t memory_size = 500;
int numel = memory_size / sizeof(float); int numel = memory_size / sizeof(float);
framework::Tensor out_side_tensor; framework::Tensor out_side_tensor;
{ {
auto allocation = gpu_alloc.Allocate(memory_size); framework::Tensor tensor =
address = allocation->ptr(); ctx.AllocateTmpTensor<float, platform::CUDADeviceContext>(
framework::Tensor tensor = framework::GetTensor<float>( framework::make_ddim({numel}), *dev_ctx);
std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel); PADDLE_ENFORCE_EQ(tensor.numel(), numel);
out_side_tensor.ShareDataWith(tensor); out_side_tensor.ShareDataWith(tensor);
} }
PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
// The allocation is holded by out_side_tensor.
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
gpu_alloc.Release([]() {});
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
} }
// The allocation is not holded now, it should be placed to
// TemporaryAllocationQueue.
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
gpu_alloc.Release([]() {});
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
#endif #endif
} }
......
...@@ -946,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -946,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(The type is STR, debug_graphviz_path indicate the path that R"DOC(The type is STR, debug_graphviz_path indicate the path that
writing the SSA Graph to file in the form of graphviz, you. writing the SSA Graph to file in the form of graphviz, you.
It is useful for debugging. Default "")DOC") It is useful for debugging. Default "")DOC")
.def_property(
"enable_data_balance",
[](const BuildStrategy &self) { return self.enable_data_balance_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
self.enable_data_balance_ = b;
}) // FIXME(chengudo): enable_data_balance seems not important
.def_property( .def_property(
"enable_sequential_execution", "enable_sequential_execution",
[](const BuildStrategy &self) { [](const BuildStrategy &self) {
...@@ -1007,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1007,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle.
"memory_optimize", "memory_optimize",
[](const BuildStrategy &self) { return self.memory_optimize_; }, [](const BuildStrategy &self) { return self.memory_optimize_; },
[](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; })
.def_property(
"is_distribution",
[](const BuildStrategy &self) { return self.is_distribution_; },
[](BuildStrategy &self, bool b) { self.is_distribution_ = b; })
.def_property( .def_property(
"memory_early_delete", "memory_early_delete",
[](const BuildStrategy &self) { return self.memory_early_delete_; }, [](const BuildStrategy &self) { return self.memory_early_delete_; },
......
...@@ -199,6 +199,7 @@ function cmake_gen() { ...@@ -199,6 +199,7 @@ function cmake_gen() {
-DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}
-DPY_VERSION=${PY_VERSION:-2.7} -DPY_VERSION=${PY_VERSION:-2.7}
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
-DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF}
======================================== ========================================
EOF EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because # Disable UNITTEST_USE_VIRTUALENV in docker because
...@@ -232,7 +233,8 @@ EOF ...@@ -232,7 +233,8 @@ EOF
-DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\
-DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\
-DPY_VERSION=${PY_VERSION:-2.7} \ -DPY_VERSION=${PY_VERSION:-2.7} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \
-DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF}
} }
...@@ -918,11 +920,11 @@ function main() { ...@@ -918,11 +920,11 @@ function main() {
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
build build
assert_api_not_changed ${PYTHON_ABI:-""} assert_api_not_changed ${PYTHON_ABI:-""}
assert_api_spec_approvals
run_test run_test
gen_capi_package gen_capi_package
gen_fluid_lib gen_fluid_lib
test_fluid_lib test_fluid_lib
assert_api_spec_approvals
;; ;;
assert_api) assert_api)
assert_api_not_changed ${PYTHON_ABI:-""} assert_api_not_changed ${PYTHON_ABI:-""}
......
...@@ -102,13 +102,6 @@ def __bootstrap__(): ...@@ -102,13 +102,6 @@ def __bootstrap__():
import sys import sys
import os import os
import platform import platform
if os.name == 'nt':
third_lib_path = os.path.abspath(os.path.dirname(
__file__)) + os.sep + '..' + os.sep + 'libs'
os.environ['path'] += ';' + third_lib_path
sys.path.append(third_lib_path)
from . import core from . import core
in_test = 'unittest' in sys.modules in_test = 'unittest' in sys.modules
...@@ -135,7 +128,8 @@ def __bootstrap__(): ...@@ -135,7 +128,8 @@ def __bootstrap__():
'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'enable_parallel_graph'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
...@@ -158,14 +152,10 @@ def __bootstrap__(): ...@@ -158,14 +152,10 @@ def __bootstrap__():
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
'fraction_of_gpu_memory_to_use', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'enable_cublas_tensor_op_math', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'conv_workspace_size_limit', 'cudnn_exhaustive_search_times', 'sync_nccl_allreduce'
'cudnn_exhaustive_search',
'memory_optimize_debug',
'selected_gpus',
'cudnn_exhaustive_search_times',
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
......
...@@ -26,6 +26,13 @@ import numpy as np ...@@ -26,6 +26,13 @@ import numpy as np
from .. import compat as cpt from .. import compat as cpt
from .proto import framework_pb2 from .proto import framework_pb2
try: try:
if os.name == 'nt':
import sys
third_lib_path = os.path.abspath(os.path.dirname(
__file__)) + os.sep + '..' + os.sep + 'libs'
os.environ['path'] += ';' + third_lib_path
sys.path.append(third_lib_path)
from . import core from . import core
except ImportError as e: except ImportError as e:
if os.name == 'nt': if os.name == 'nt':
......
...@@ -26,7 +26,7 @@ from ..initializer import Normal, Constant ...@@ -26,7 +26,7 @@ from ..initializer import Normal, Constant
from ..framework import Variable, OpProtoHolder from ..framework import Variable, OpProtoHolder
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
from .tensor import concat from .tensor import concat, assign
from . import utils from . import utils
from .. import unique_name from .. import unique_name
from functools import reduce from functools import reduce
...@@ -340,9 +340,7 @@ def embedding(input, ...@@ -340,9 +340,7 @@ def embedding(input,
""" """
helper = LayerHelper('embedding', **locals()) helper = LayerHelper('embedding', **locals())
remote_prefetch = False remote_prefetch = is_sparse and (not is_distributed)
if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'):
remote_prefetch = True
if remote_prefetch: if remote_prefetch:
assert is_sparse is True and is_distributed is False assert is_sparse is True and is_distributed is False
w = helper.create_parameter( w = helper.create_parameter(
...@@ -5032,12 +5030,18 @@ def nce(input, ...@@ -5032,12 +5030,18 @@ def nce(input,
else: else:
num_neg_samples = int(num_neg_samples) num_neg_samples = int(num_neg_samples)
remote_prefetch = is_sparse
print(
"With sparse mode, if your models has only small parameter prefetch may cause speed down"
)
attrs = { attrs = {
'num_total_classes': int(num_total_classes), 'num_total_classes': int(num_total_classes),
'num_neg_samples': num_neg_samples, 'num_neg_samples': num_neg_samples,
'seed': seed, 'seed': seed,
'sampler': sampler, 'sampler': sampler,
'is_sparse': is_sparse 'is_sparse': is_sparse,
'remote_prefetch': remote_prefetch
} }
helper.append_op( helper.append_op(
...@@ -5147,7 +5151,10 @@ def hsigmoid(input, ...@@ -5147,7 +5151,10 @@ def hsigmoid(input,
pass pass
weights = None weights = None
remote_prefetch = is_sparse
print(
"With sparse mode, if your models has only small parameter prefetch may cause speed down"
)
if not is_custom: if not is_custom:
weights = helper.create_parameter( weights = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr,
...@@ -5163,7 +5170,7 @@ def hsigmoid(input, ...@@ -5163,7 +5170,7 @@ def hsigmoid(input,
inputs = { inputs = {
"X": input, "X": input,
"W": weights, "W": weights,
"PTable": path_table, "PathTable": path_table,
"PathCode": path_code, "PathCode": path_code,
"Label": label "Label": label
} }
...@@ -5186,9 +5193,13 @@ def hsigmoid(input, ...@@ -5186,9 +5193,13 @@ def hsigmoid(input,
type="hierarchical_sigmoid", type="hierarchical_sigmoid",
inputs=inputs, inputs=inputs,
outputs={"Out": out, outputs={"Out": out,
"PreOut": pre_out}, "PreOut": pre_out,
attrs={"num_classes": num_classes, "W_Out": weights},
"is_sparse": is_sparse}) attrs={
"num_classes": num_classes,
"is_sparse": is_sparse,
"remote_prefetch": remote_prefetch
})
return out return out
......
...@@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy ...@@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy = core.ParallelExecutor.BuildStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy
def _is_pserver_mode(main_program):
main = main_program if main_program \
else framework.default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
class ParallelExecutor(object): class ParallelExecutor(object):
""" """
ParallelExecutor is designed for data parallelism, which focuses on distributing ParallelExecutor is designed for data parallelism, which focuses on distributing
...@@ -128,6 +137,11 @@ class ParallelExecutor(object): ...@@ -128,6 +137,11 @@ class ParallelExecutor(object):
build_strategy = BuildStrategy() build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id build_strategy.trainer_id = trainer_id
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy.is_distribution = _is_pserver_mode(
main_program) or num_trainers > 1
# step4: get main_program, scope, local_scopes # step4: get main_program, scope, local_scopes
main = main_program if main_program \ main = main_program if main_program \
......
...@@ -21,18 +21,19 @@ if(NOT WITH_DISTRIBUTE) ...@@ -21,18 +21,19 @@ if(NOT WITH_DISTRIBUTE)
LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
endif(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE)
if (NOT ${WITH_GPU}) if (NOT ${WITH_GPU})
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) elseif(${CUDNN_VERSION} VERSION_LESS 7100)
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
endif() endif()
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp
class TestNGRAPHMeanOp(TestMeanOp):
def setUp(self):
super(TestNGRAPHMeanOp, self).setUp()
class TestNGRAPHFP16MeanOp(TestFP16MeanOp):
def setUp(self):
super(TestNGRAPHFP16MeanOp, self).setUp()
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows
class TestNGRAPHScaleOp(TestScaleOp):
def init_dtype_type(self):
pass
class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
def init_dtype_type(self):
pass
class TestNGRAPHScaleFp16Op(TestScaleFp16Op):
def init_dtype_type(self):
pass
class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows):
def init_dtype_type(self):
pass
if __name__ == "__main__":
unittest.main()
...@@ -78,7 +78,6 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -78,7 +78,6 @@ class TestParallelExecutorBase(unittest.TestCase):
exec_strategy.allow_op_delay = allow_op_delay exec_strategy.allow_op_delay = allow_op_delay
if use_fast_executor: if use_fast_executor:
exec_strategy.use_experimental_executor = True exec_strategy.use_experimental_executor = True
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
......
...@@ -51,8 +51,9 @@ class TestConv2dFusionOp(OpTest): ...@@ -51,8 +51,9 @@ class TestConv2dFusionOp(OpTest):
input = np.random.random(self.input_size).astype(self.dtype) input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype)
self.output = conv2d_forward_naive(input, filter, self.groups, self.output, _, _, _, _ = conv2d_forward_naive(
conv2d_param).astype(self.dtype) input, filter, self.groups, conv2d_param)
self.output = self.output.astype(self.dtype)
self.inputs = { self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input), 'Input': OpTest.np_dtype_to_fluid_dtype(input),
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from test_conv2d_op import conv2d_forward_naive, TestConv2dOp
def conv2d_forward_refer(input, filter, group, conv_param):
out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
conv_param)
out_tmp = np.zeros((in_n, out_h, out_w, out_c))
for n in range(in_n):
for i in range(out_h):
for j in range(out_w):
for m in range(out_c):
out_tmp[n, i, j, m] = out[n, m, i, j]
return out_tmp.reshape(in_n, out_c, out_h, out_w)
class TestConv2dInt8Op(TestConv2dOp):
def setUp(self):
self.op_type = "conv2d"
self.use_cudnn = False
self.exhaustive_search = False
self.use_cuda = False
self.use_mkldnn = False
self.data_format = "AnyLayout"
self.weighttype = np.float32
self.use_mkldnn = True
self.init_group()
self.init_dilation()
self.init_test_case()
self.init_fuse_relu()
self.init_data_type()
conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
filter = np.random.random(self.filter_size).astype(self.weighttype)
if self.srctype == np.uint8:
input = np.random.randint(0, 10,
self.input_size).astype(self.srctype)
else:
input = np.random.randint(-5, 5,
self.input_size).astype(self.srctype)
input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
if self.srctype == np.int8:
filter_int = np.round(filter * self.scale_weights[0] *
0.5).astype(np.int32)
scale_output_shift = self.scale_out / (self.scale_in *
self.scale_weights[0] * 0.5)
output1 = conv2d_forward_refer(
np.round((input.astype(np.int32) + input_shift) *
self.scale_in).astype(np.int32), filter_int,
self.groups,
conv2d_param).astype(np.float32) * scale_output_shift
output2 = conv2d_forward_refer(
np.round((input_shift) * self.scale_in).astype(np.int32),
filter_int, self.groups,
conv2d_param).astype(np.float32) * scale_output_shift
if self.fuse_relu:
output = np.maximum(np.round(output1 - output2),
0).astype(self.dsttype)
else:
output = np.round(output1 - output2).astype(self.dsttype)
else:
filter_int = np.round(filter *
self.scale_weights[0]).astype(np.int32)
scale_output_shift = self.scale_out / (self.scale_in *
self.scale_weights[0])
output1 = conv2d_forward_refer(
input.astype(np.int32), filter_int, self.groups,
conv2d_param).astype(np.float32)
if self.fuse_relu:
output = np.maximum(
np.round(output1 * (self.scale_out / (
self.scale_in * self.scale_weights[0]))),
0).astype(self.dsttype)
else:
output = np.round(output1 * (self.scale_out / (
self.scale_in *
self.scale_weights[0]))).astype(self.dsttype)
self.inputs = {
'Input':
OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)),
'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
}
self.attrs = {
'strides': self.stride,
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format,
'exhaustive_search': self.exhaustive_search,
'Scale_in': self.scale_in,
'Scale_out': self.scale_out,
'Scale_weights': self.scale_weights,
'fuse_relu': self.fuse_relu
}
self.outputs = {'Output': output}
def test_check_output(self):
self.check_output_with_place(core.CPUPlace(), atol=0)
def test_check_grad(self):
pass
def test_check_grad_no_filter(self):
pass
def test_check_grad_no_input(self):
pass
def init_test_case(self):
TestConv2dOp.init_test_case(self)
f_c = self.input_size[1] // self.groups
self.filter_size = [1, f_c, 3, 3]
self.scale_in = 1.0
self.scale_out = 0.5
self.scale_weights = [10.0]
def init_data_type(self):
self.srctype = np.uint8
self.dsttype = np.int8
def init_fuse_relu(self):
self.fuse_relu = True
#--------------------test conv2d u8 in and u8 out--------------------
class TestConv2d(TestConv2dInt8Op):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.scale_in = 1.0
self.scale_out = 0.5
self.scale_weights = [10.0]
class TestWithPad(TestConv2d):
def init_test_case(self):
TestConv2d.init_test_case(self)
self.pad = [1, 1]
class TestWithGroup(TestConv2d):
def init_group(self):
self.groups = 3
class TestWithStride(TestConv2dInt8Op):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.scale_in = 1.0
self.scale_out = 0.8
self.scale_weights = [10.0]
class TestWith1x1(TestConv2dInt8Op):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [1, 3, 5, 5]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 1, 1]
self.scale_in = 1.0
self.scale_out = 0.5
self.scale_weights = [12.0]
class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 1, 1]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 1, 1]
self.scale_in = 1.0
self.scale_out = 0.5
self.scale_weights = [10.0]
def init_group(self):
self.groups = 3
def init_data_type_with_fusion(self, input_dt, fuse_relu):
self.srctype = input_dt
self.dsttype = np.uint8 if fuse_relu else np.int8
def init_fuse_relu(self):
self.fuse_relu = fuse_relu
def create_test_int8_class(parent):
#--------------------test conv2d s8 in and u8 out--------------------
class TestS8U8Case(parent):
def init_data_type(self):
init_data_type_with_fusion(self, np.int8, True)
#--------------------test conv2d s8 in and s8 out--------------------
class TestS8S8Case(parent):
def init_data_type(self):
init_data_type_with_fusion(self, np.int8, False)
#--------------------test conv2d u8 in and s8 out--------------------
class TestU8S8Case(parent):
def init_data_type(self):
init_data_type_with_fusion(self, np.uint8, False)
cls_name_s8u8 = "{0}_relu_{1}".format(parent.__name__, "1")
cls_name_s8s8 = "{0}_relu_{1}".format(parent.__name__, "0")
cls_name_u8s8 = "{0}_relu_{1}".format(parent.__name__, "0")
TestS8U8Case.__name__ = cls_name_s8u8
TestS8S8Case.__name__ = cls_name_s8s8
TestU8S8Case.__name__ = cls_name_u8s8
globals()[cls_name_s8u8] = TestS8U8Case
globals()[cls_name_s8s8] = TestS8S8Case
globals()[cls_name_u8s8] = TestU8S8Case
create_test_int8_class(TestConv2dInt8Op)
create_test_int8_class(TestWithPad)
create_test_int8_class(TestWithStride)
create_test_int8_class(TestWithGroup)
create_test_int8_class(TestWith1x1)
create_test_int8_class(TestWithInput1x1Filter1x1)
if __name__ == '__main__':
unittest.main()
...@@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): ...@@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
np.sum(input_pad_masked * f_sub[k, :, :, :], np.sum(input_pad_masked * f_sub[k, :, :, :],
axis=(1, 2, 3)) axis=(1, 2, 3))
return out return out, in_n, out_h, out_w, out_c
class TestConv2dOp(OpTest): class TestConv2dOp(OpTest):
...@@ -85,8 +85,9 @@ class TestConv2dOp(OpTest): ...@@ -85,8 +85,9 @@ class TestConv2dOp(OpTest):
input = np.random.random(self.input_size).astype(self.dtype) input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype)
output = conv2d_forward_naive(input, filter, self.groups, output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
conv2d_param).astype(self.dtype) conv2d_param)
output = output.astype(self.dtype)
self.inputs = { self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input), 'Input': OpTest.np_dtype_to_fluid_dtype(input),
......
...@@ -442,10 +442,10 @@ class TestDistBase(unittest.TestCase): ...@@ -442,10 +442,10 @@ class TestDistBase(unittest.TestCase):
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f"
tr0_cmd = tr_cmd % \ tr0_cmd = tr_cmd % \
(self._python_interp, model, self._ps_endpoints, (self._python_interp, model, self._ps_endpoints,
0, w0_ep, self._lr / 2) 0, w0_ep, self._lr)
tr1_cmd = tr_cmd % \ tr1_cmd = tr_cmd % \
(self._python_interp, model, self._ps_endpoints, (self._python_interp, model, self._ps_endpoints,
1, w1_ep, self._lr / 2) 1, w1_ep, self._lr)
if self._mem_opt: if self._mem_opt:
tr0_cmd += " --mem_opt" tr0_cmd += " --mem_opt"
......
...@@ -14,14 +14,15 @@ ...@@ -14,14 +14,15 @@
from __future__ import print_function from __future__ import print_function
import traceback
import math import math
import collections
import six
import unittest import unittest
import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.transpiler.distribute_transpiler import delete_ops
import traceback
import collections
import six
class TranspilerTest(unittest.TestCase): class TranspilerTest(unittest.TestCase):
...@@ -520,7 +521,7 @@ class TestLocalLookupTable(TestDistLookupTableBase): ...@@ -520,7 +521,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
'split_selected_rows', 'send', 'sequence_pool_grad', 'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv', 'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat' 'recv', 'fetch_barrier'
] ]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
...@@ -560,7 +561,7 @@ class TestDistLookupTable(TestDistLookupTableBase): ...@@ -560,7 +561,7 @@ class TestDistLookupTable(TestDistLookupTableBase):
'lookup_table_grad', 'split_selected_rows', 'send', 'lookup_table_grad', 'split_selected_rows', 'send',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier', 'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
'recv', 'recv', 'recv', 'fetch_barrier', 'concat' 'recv', 'recv', 'fetch_barrier'
] ]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
startup_ops = [ startup_ops = [
...@@ -607,8 +608,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): ...@@ -607,8 +608,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad', 'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv', 'sum', 'split_selected_rows', 'send', 'recv', 'recv'
'recv', 'concat', 'concat'
] ]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
...@@ -648,8 +648,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): ...@@ -648,8 +648,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'split_selected_rows', 'send', 'lookup_table_grad', 'split_selected_rows', 'send',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv', 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv'
'recv', 'concat'
] ]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
startup_ops = [ startup_ops = [
...@@ -824,5 +823,142 @@ class TestRemoteLookupTable(TestDistLookupTableBase): ...@@ -824,5 +823,142 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
# test for remote prefetch
class TestRemoteNce(TestDistLookupTableBase):
def network_with_table(self, is_sparse, is_distributed):
num_total_classes = 20
sampler = "uniform"
nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
input = fluid.layers.data(name="input", shape=[10], dtype="float32")
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
w_param = fluid.default_main_program().global_block().create_parameter(
shape=[num_total_classes, 10],
dtype='float32',
name='nce_w',
initializer=fluid.initializer.ConstantInitializer())
b_param = fluid.default_main_program().global_block().create_parameter(
shape=[num_total_classes, 1],
dtype='float32',
name='nce_b',
initializer=fluid.initializer.ConstantInitializer())
cost = fluid.layers.nce(input=input,
label=label,
num_total_classes=num_total_classes,
sampler=sampler,
custom_dist=nid_freq_arr.tolist(),
sample_weight=None,
param_attr='nce_w',
bias_attr='nce_b',
seed=1,
num_neg_samples=5,
is_sparse=is_sparse)
avg_cost = fluid.layers.mean(cost)
# optimizer
optimizer = fluid.optimizer.Adam(learning_rate=0.003)
optimizer.minimize(avg_cost)
def net_conf(self):
import os
os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
self.network_with_table(is_sparse=True, is_distributed=False)
def transpiler_test_impl(self):
trainer, _ = self.get_trainer()
out_vars = ["nce_w"]
in_vars = ["nce_b"]
recv_var_names = []
for op in trainer.blocks[0].ops:
if op.type == "recv":
for var in op.output("Out"):
recv_var_names.append(var)
for out_var in out_vars:
self.assertFalse(out_var in recv_var_names)
for in_var in in_vars:
self.assertTrue(in_var in recv_var_names)
# test for remote prefetch
class TestRemoteHsigmoid(TestDistLookupTableBase):
def network_with_table(self, is_sparse, is_distributed):
num_total_classes = 3
input = fluid.layers.data(name="input", shape=[1], dtype="float32")
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
path_table = fluid.layers.data(
name='path_table', shape=[3], dtype='int64')
path_code = fluid.layers.data(
name='path_code', shape=[3], dtype='int64')
w_param = fluid.default_main_program().global_block().create_parameter(
shape=[num_total_classes, 10],
dtype='float32',
name='hs_w',
initializer=fluid.initializer.ConstantInitializer())
b_param = fluid.default_main_program().global_block().create_parameter(
shape=[3, 1],
dtype='float32',
name='hs_b',
initializer=fluid.initializer.ConstantInitializer())
emb = fluid.layers.embedding(
input=input,
is_sparse=is_sparse,
size=[3, 3],
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(num_total_classes))))
cost = fluid.layers.hsigmoid(
input=emb,
label=label,
num_classes=num_total_classes,
path_table=path_table,
path_code=path_code,
is_custom=True,
is_sparse=is_sparse)
avg_cost = fluid.layers.mean(cost)
# optimizer
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
optimizer.minimize(avg_cost)
def net_conf(self):
import os
os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
self.network_with_table(is_sparse=True, is_distributed=False)
def transpiler_test_impl(self):
trainer, _ = self.get_trainer()
params_to_check = list()
for op in trainer.blocks[0].ops:
if op.type == "hierarchical_sigmoid":
params_to_check = [op.input("W")[0], op.input("Bias")[0]]
for name in ["epmap", "table_names", "epmap"]:
assert op.has_attr(name)
if name == "epmap":
assert op.attr(name)[0] == u'127.0.0.1:6174'
elif name == "table_names":
assert op.attr(name)[0] == u'hierarchical_sigmoid_0.w_0'
else:
assert op.attr(name) == 3
elif op.type == "lookup_table":
params_to_check.append(op.input("W")[0])
else:
pass
op_count = 0
for op in trainer.blocks[0].ops:
if op.type == "recv":
assert len(op.output("Out")) == 1
assert op.output("Out")[0] == u'hierarchical_sigmoid_0.b_0'
op_count += 1
assert op_count == 1
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -29,6 +29,12 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): ...@@ -29,6 +29,12 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
print('Skip use_cuda=True because Paddle is not compiled with cuda') print('Skip use_cuda=True because Paddle is not compiled with cuda')
return return
if use_parallel_executor and os.name == 'nt':
print(
'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
)
return
word_dict = paddle.dataset.imdb.word_dict() word_dict = paddle.dataset.imdb.word_dict()
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.imdb.train(word_dict), batch_size=batch_size) paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid.op import Operator
import paddle.compat as cpt
class TestFusedEmbeddingSeqPoolOp(OpTest):
def setUp(self):
self.op_type = "fused_embedding_seq_pool"
self.emb_size = 2
table = np.random.random((17, self.emb_size)).astype("float32")
ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]],
[[16], [1]]]).astype("int64")
merged_ids = np.array([4, 2, 16]).astype("int64")
ids_expand = np.expand_dims(ids, axis=1)
self.lod = [[3, 1]]
self.attrs = {'is_sparse': True}
self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)}
self.outputs = {
'Out': np.reshape(
np.array([
table[[4, 3]] + table[[4, 3]] + table[[2, 1]],
table[[16, 1]]
]), [len(self.lod[0]), 2 * self.emb_size])
}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
...@@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest): ...@@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest):
self.inputs = { self.inputs = {
'X': x, 'X': x,
'W': w, 'W': w,
'PTable': path_table, 'PathTable': path_table,
'PathCode': path_code, 'PathCode': path_code,
'Label': label, 'Label': label,
'Bias': bias 'Bias': bias
...@@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest): ...@@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest):
self.inputs = { self.inputs = {
'X': x, 'X': x,
'W': w, 'W': w,
'PTable': path_table, 'PathTable': path_table,
'PathCode': path_code, 'PathCode': path_code,
'Label': label, 'Label': label,
'Bias': bias 'Bias': bias
...@@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest): ...@@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest):
self.inputs = { self.inputs = {
'X': x, 'X': x,
'W': w, 'W': w,
'PTable': path_table, 'PathTable': path_table,
'PathCode': path_code, 'PathCode': path_code,
'Label': label, 'Label': label,
} }
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import signal
import time
import unittest
from multiprocessing import Process
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.framework import Program, program_guard
def run_pserver(pserver_id, use_cuda, sync_mode):
scope = fluid.core.Scope()
program = Program()
with fluid.scope_guard(scope):
with program_guard(program, startup_program=Program()):
# create table parameter in scope
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
# create and initialize Param Variable
param = scope.var('table').get_tensor()
param_array = np.ones((5, 8)).astype("float32")
for i in range(len(param_array)):
param_array[i] *= param_array[i] * i + pserver_id * 10 + 1
param.set(param_array, place)
optimize_block = program._create_block(program.global_block().idx)
program.global_block().append_op(
type="listen_and_serv",
inputs={'X': []},
outputs={},
attrs={
"optimize_blocks": [optimize_block],
"endpoint": '127.0.0.1:0',
"Fanin": 1,
"sync_mode": True,
"grad_to_block_id": []
})
exe = fluid.Executor(place)
exe.run(program)
class TestListenAndServOp(unittest.TestCase):
def setUp(self):
self.ps_timeout = 5
def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
p.daemon = True
p.start()
return p
def _wait_ps_ready(self, pid):
start_left_time = self.ps_timeout
sleep_time = 0.5
while True:
assert start_left_time >= 0, "wait ps ready failed"
time.sleep(sleep_time)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
start_left_time -= sleep_time
def _get_pserver_port(self, pid):
with open("/tmp/paddle.%d.port" % pid, 'r') as f:
port = int(f.read().strip())
return port
def _run_hsigmoid_op_one_pserver(self, place, port):
scope = fluid.core.Scope()
program = Program()
with fluid.scope_guard(scope):
with program_guard(program, startup_program=Program()):
x = scope.var('X').get_tensor()
x_array = np.random.random((4, 8)).astype("float32") * 2
x.set(x_array, place)
# create and initialize Param Variable
param = scope.var('W').get_tensor()
param_array = np.zeros((5, 8)).astype("float32") * 2
param.set(param_array, place)
path_table = scope.var('PathTable').get_tensor()
path_table_array = np.array(
[(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1),
(0, 2, -1, -1, -1)]).astype(
"int64"
) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
path_table.set(path_table_array, place)
path_code = scope.var('PathCode').get_tensor()
path_code_array = np.array(
[(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
(0, 1, -1, -1, -1)]).astype("int64") #np.array to store
path_code.set(path_code_array, place)
label = scope.var('Label').get_tensor()
label_array = np.array([0, 1, 4, 5])
label.set(label_array, place)
bias = scope.var('Bias').get_tensor()
bias_array = np.random.random((5, 1)).astype("float32")
bias.set(bias_array, place)
out = scope.var('Out').get_tensor()
pre_out = scope.var('PreOut').get_tensor
w_out = scope.var('W_Out').get_tensor()
w_out.set(param_array, place)
emaps = ['127.0.0.1:' + str(port)]
table_names = ['table']
height_sections = [2]
# create and run sgd operator
hsigmoid_op = Operator(
"hierarchical_sigmoid",
X='X',
W='W',
PathTable='PathTable',
PathCode='PathCode',
Label='Label',
Bias='Bias',
Out='Out',
PreOut='PreOut',
W_Out='W_Out',
remote_prefetch=True,
epmap=emaps,
table_names=table_names,
height_sections=height_sections)
hsigmoid_op.run(scope, place)
# get and compare result
result_array = np.array(w_out)
self.assertEqual(list(result_array.shape), [5, 8])
correct = None
for i in range(5):
if i != 3:
correct = np.full((1, 8), i + 1).astype("float32")
self.assertTrue((result_array[i] == correct).all())
else:
correct = np.full((1, 8), 0).astype("float32")
self.assertTrue((result_array[i] == correct).all())
def _run_hsigmoid_op_two_pserver(self, place, port0, port1):
scope = fluid.core.Scope()
program = Program()
with fluid.scope_guard(scope):
with program_guard(program, startup_program=Program()):
x = scope.var('X').get_tensor()
x_array = np.random.random((4, 8)).astype("float32") * 2
x.set(x_array, place)
# create and initialize Param Variable
param = scope.var('W').get_tensor()
param_array = np.zeros((5, 8)).astype("float32") * 2
param.set(param_array, place)
path_table = scope.var('PathTable').get_tensor()
path_table_array = np.array(
[(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
(0, 2, -1, -1, -1)]).astype(
"int64"
) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
path_table.set(path_table_array, place)
path_code = scope.var('PathCode').get_tensor()
path_code_array = np.array(
[(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
(0, 1, -1, -1, -1)]).astype("int64") #np.array to store
path_code.set(path_code_array, place)
label = scope.var('Label').get_tensor()
label_array = np.array([0, 1, 4, 5])
label.set(label_array, place)
bias = scope.var('Bias').get_tensor()
bias_array = np.random.random((5, 1)).astype("float32")
bias.set(bias_array, place)
out = scope.var('Out').get_tensor()
pre_out = scope.var('PreOut').get_tensor
w_out = scope.var('W_Out').get_tensor()
w_out.set(param_array, place)
emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
table_names = ['table', 'table']
height_sections = [2, 3]
# create and run sgd operator
hsigmoid_op = Operator(
"hierarchical_sigmoid",
X='X',
W='W',
PathTable='PathTable',
PathCode='PathCode',
Label='Label',
Bias='Bias',
Out='Out',
PreOut='PreOut',
W_Out='W_Out',
remote_prefetch=True,
epmap=emaps,
table_names=table_names,
height_sections=height_sections)
hsigmoid_op.run(scope, place)
# get and compare result
result_array = np.array(w_out)
self.assertEqual(list(result_array.shape), [5, 8])
correct = None
for i in range(5):
if i < 2:
correct = np.full((1, 8), i + 1).astype("float32")
self.assertTrue((result_array[i] == correct).all())
else:
correct = np.full((1, 8), i + 9).astype("float32")
self.assertTrue((result_array[i] == correct).all())
def test_hsigmoid_op_remote(self):
os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
# run pserver on CPU in sync mode
p0 = self._start_pserver(0, False, True, run_pserver)
self._wait_ps_ready(p0.pid)
port0 = self._get_pserver_port(p0.pid)
p1 = self._start_pserver(1, False, True, run_pserver)
self._wait_ps_ready(p1.pid)
port1 = self._get_pserver_port(p1.pid)
places = [core.CPUPlace()]
for place in places:
self._run_hsigmoid_op_one_pserver(place, port0)
self._run_hsigmoid_op_two_pserver(place, port0, port1)
# raise SIGTERM to pserver
os.kill(p0.pid, signal.SIGINT)
p0.join()
os.kill(p1.pid, signal.SIGINT)
p1.join()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import signal
import time
import unittest
from multiprocessing import Process
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.framework import Program, program_guard
def nce(input, weight, bias, sample_weight, labels, num_classes,
num_sample_class):
samples = []
sample_labels = []
batch_size = input.shape[0]
num_true_class = labels.shape[1]
for i in range(batch_size):
w = 1 if sample_weight is None else sample_weight[i]
for label in labels[i]:
samples.append((i, label, True, w))
sample_labels.append(label)
for num in range(num_sample_class):
samples.append((i, num, False, w))
sample_labels.append(num)
# forward bias
sample_out = np.zeros(len(samples)).astype(np.float32)
if bias is not None:
for i in range(len(samples)):
sample_out[i] = bias[samples[i][1]]
# forward weight
for i in range(len(samples)):
sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]])
# forward activation
sample_out = 1.0 / (1.0 + np.exp(-sample_out))
# forward cost
out = np.zeros(batch_size).astype(np.float32)
b = 1.0 / num_classes * num_sample_class
for i in range(len(samples)):
o = sample_out[i]
cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b))
out[samples[i][0]] += cost * samples[i][3]
return (out[:, np.newaxis], np.array(sample_out).reshape(
batch_size, num_sample_class + num_true_class),
np.array(sample_labels).reshape(batch_size,
num_sample_class + num_true_class))
def run_pserver(pserver_id, use_cuda, sync_mode):
scope = fluid.core.Scope()
program = Program()
with fluid.scope_guard(scope):
with program_guard(program, startup_program=Program()):
# create table parameter in scope
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
# create and initialize Param Variable
param = scope.var('table').get_tensor()
param_array = np.ones((5, 8)).astype("float32")
for i in range(len(param_array)):
param_array[i] *= param_array[i] * i + pserver_id * 10 + 1
param.set(param_array, place)
optimize_block = program._create_block(program.global_block().idx)
program.global_block().append_op(
type="listen_and_serv",
inputs={'X': []},
outputs={},
attrs={
"optimize_blocks": [optimize_block],
"endpoint": '127.0.0.1:0',
"Fanin": 1,
"sync_mode": True,
"grad_to_block_id": []
})
exe = fluid.Executor(place)
exe.run(program)
class TestListenAndServOp(unittest.TestCase):
def setUp(self):
self.ps_timeout = 5
def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
p.daemon = True
p.start()
return p
def _wait_ps_ready(self, pid):
start_left_time = self.ps_timeout
sleep_time = 0.5
while True:
assert start_left_time >= 0, "wait ps ready failed"
time.sleep(sleep_time)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
start_left_time -= sleep_time
def _get_pserver_port(self, pid):
with open("/tmp/paddle.%d.port" % pid, 'r') as f:
port = int(f.read().strip())
return port
def _run_nce_op_two_pserver(self, place, port0, port1):
scope = fluid.core.Scope()
program = Program()
with fluid.scope_guard(scope):
with program_guard(program, startup_program=Program()):
x = scope.var('Input').get_tensor()
x_array = np.random.random((4, 8)).astype("float32")
x.set(x_array, place)
# create and initialize Param Variable
param = scope.var('Weight').get_tensor()
param_array = np.zeros((5, 8)).astype("float32")
param.set(param_array, place)
bias = scope.var('Bias').get_tensor()
bias_array = np.random.random((5, 1)).astype("float32")
bias.set(bias_array, place)
sample_w = scope.var('SampleWeight').get_tensor()
sample_weight = np.random.random((4, 1)).astype("float32")
sample_w.set(sample_weight, place)
label = scope.var('Label').get_tensor()
label_array = np.array([[0], [1], [4], [3]])
label.set(label_array, place)
cost = scope.var('Cost').get_tensor()
cost_w = np.zeros((4, 1)).astype("float32")
cost.set(cost_w, place)
sample_l = scope.var('SampleLogits').get_tensor()
sample_l_w = np.zeros((4, 3)).astype("float32")
sample_l.set(sample_l_w, place)
sample_la = scope.var('SampleLabels').get_tensor()
sample_la_w = np.zeros((4, 3)).astype("int")
sample_la.set(sample_la_w, place)
emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
table_names = ['table', 'table']
height_sections = [2, 3]
# create and run nce operator
nce_op = Operator(
"nce",
Input='Input',
Weight='Weight',
Label='Label',
Bias='Bias',
Cost='Cost',
SampleLogits='SampleLogits',
SampleLabels='SampleLabels',
SampleWeight='SampleWeight',
num_total_classes=5,
num_neg_samples=2,
custom_neg_classes=list(range(2)),
sampler=0,
seed=0,
is_sparse=True,
remote_prefetch=True,
epmap=emaps,
table_names=table_names,
height_sections=height_sections)
nce_op.run(scope, place)
# get and compare result
o_cost = np.array(scope.var('Cost').get_tensor())
o_logits = np.array(scope.var('SampleLogits').get_tensor())
o_labels = np.array(scope.var('SampleLabels').get_tensor())
param_array = np.ones((5, 8)).astype("float32")
for i in range(2):
param_array[i] *= param_array[i] * i + 0 * 10 + 1
for i in range(2, 5):
param_array[i] *= param_array[i] * i + 1 * 10 + 1
out = nce(x_array, param_array, bias_array, sample_weight,
label_array, 5, 2)
self.assertAlmostEqual(o_cost.all(), out[0].all(), delta=1e-6)
self.assertAlmostEqual(o_logits.all(), out[1].all(), delta=1e-6)
self.assertAlmostEqual(o_labels.all(), out[2].all(), delta=1e-6)
def test_nce_op_remote(self):
os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
# run pserver on CPU in sync mode
p0 = self._start_pserver(0, False, True, run_pserver)
self._wait_ps_ready(p0.pid)
port0 = self._get_pserver_port(p0.pid)
p1 = self._start_pserver(1, False, True, run_pserver)
self._wait_ps_ready(p1.pid)
port1 = self._get_pserver_port(p1.pid)
places = [core.CPUPlace()]
for place in places:
self._run_nce_op_two_pserver(place, port0, port1)
# raise SIGTERM to pserver
os.kill(p0.pid, signal.SIGINT)
p0.join()
os.kill(p1.pid, signal.SIGINT)
p1.join()
if __name__ == '__main__':
unittest.main()
...@@ -175,41 +175,61 @@ class TestCRFModel(unittest.TestCase): ...@@ -175,41 +175,61 @@ class TestCRFModel(unittest.TestCase):
print(pe.run(feed=feeder.feed(cur_batch), print(pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name])[0]) fetch_list=[avg_cost.name])[0])
def test_update_sparse_parameter_all_reduce(self): def _new_build_strategy(self, use_reduce=False):
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
if use_reduce:
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
return build_strategy
def test_update_sparse_parameter_all_reduce(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.check_network_convergence( self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=True) is_sparse=True,
build_strategy=self._new_build_strategy(),
use_cuda=True)
self.check_network_convergence( self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=False) is_sparse=True,
build_strategy=self._new_build_strategy(),
use_cuda=False)
def test_update_dense_parameter_all_reduce(self): def test_update_dense_parameter_all_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.check_network_convergence( self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=True) is_sparse=False,
build_strategy=self._new_build_strategy(),
use_cuda=True)
self.check_network_convergence( self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=False) is_sparse=False,
build_strategy=self._new_build_strategy(),
use_cuda=False)
def test_update_sparse_parameter_reduce(self): def test_update_sparse_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.check_network_convergence( self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=True) is_sparse=True,
build_strategy=self._new_build_strategy(use_reduce=True),
use_cuda=True)
self.check_network_convergence( self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=False) is_sparse=True,
build_strategy=self._new_build_strategy(use_reduce=True),
use_cuda=False)
def test_update_dense_parameter_reduce(self): def test_update_dense_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.check_network_convergence( self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=True) is_sparse=False,
build_strategy=self._new_build_strategy(use_reduce=True),
use_cuda=True)
self.check_network_convergence( self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=False) is_sparse=False,
build_strategy=self._new_build_strategy(use_reduce=True),
use_cuda=False)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -86,6 +86,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -86,6 +86,7 @@ class TestMNIST(TestParallelExecutorBase):
"label": label}, "label": label},
use_cuda=use_cuda, use_cuda=use_cuda,
use_reduce=False) use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence( reduce_first_loss, reduce_last_loss = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={"image": img,
......
...@@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase): ...@@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase):
exe.run(startup_prog) exe.run(startup_prog)
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
if with_double_buffer:
build_strategy.enable_data_balance = True
exec_strategy = fluid.ExecutionStrategy() exec_strategy = fluid.ExecutionStrategy()
parallel_exe = fluid.ParallelExecutor( parallel_exe = fluid.ParallelExecutor(
use_cuda=self.use_cuda, use_cuda=self.use_cuda,
......
...@@ -251,11 +251,10 @@ class DistributeTranspiler(object): ...@@ -251,11 +251,10 @@ class DistributeTranspiler(object):
def _get_all_remote_sparse_update_op(self, main_program): def _get_all_remote_sparse_update_op(self, main_program):
sparse_update_ops = [] sparse_update_ops = []
sparse_update_op_types = ["lookup_table"] sparse_update_op_types = ["lookup_table", "nce", "hierarchical_sigmoid"]
for op in main_program.global_block().ops: for op in main_program.global_block().ops:
if op.type in sparse_update_op_types and op.attr( if op.type in sparse_update_op_types and op.attr(
'remote_prefetch') is True and not op.attr( 'remote_prefetch') is True:
'is_distributed'):
sparse_update_ops.append(op) sparse_update_ops.append(op)
return sparse_update_ops return sparse_update_ops
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册