Commit 869487a2 authored by peizhilin

Merge remote-tracking branch 'origin/develop' into windows/build

@@ -67,7 +67,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
 option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
-option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
 option(WITH_CONTRIB "Compile the third-party contribution" OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debugging." OFF)
 option(WITH_ANAKIN "Compile with Anakin library" OFF)
......
@@ -50,11 +50,7 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)

 if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX512F_FOUND)
-        set(SIMD_FLAG ${AVX512F_FLAG})
-    elseif(WITH_AVX AND AVX2_FOUND)
-        set(SIMD_FLAG ${AVX2_FLAG})
-    elseif(WITH_AVX AND AVX_FOUND)
+    if(WITH_AVX AND AVX_FOUND)
         set(SIMD_FLAG ${AVX_FLAG})
     elseif(SSE3_FOUND)
         set(SIMD_FLAG ${SSE3_FLAG})
......
@@ -89,7 +89,9 @@ CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
-    __m512i a = _mm512_undefined_epi32();
+    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                  13, -5, 6, -7, 9, 2, -6, 3);
+    __m512i result = _mm512_abs_epi32 (a);
    return 0;
}" AVX512F_FOUND)
......
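The point of the probe change above: `_mm512_undefined_epi32()` need not emit any AVX-512 instruction, so the compile-and-run check could pass on hardware without AVX-512F. Feeding a concrete vector through `_mm512_abs_epi32` forces a real AVX-512F instruction to execute. A standalone version of the same probe (a sketch, not part of the commit; compile with an AVX-512F-enabled flag such as -mavx512f):

  #include <immintrin.h>
  int main() {
    __m512i a = _mm512_set_epi32(-1, 2, -3, 4, -1, 2, -3, 4,
                                 13, -5, 6, -7, 9, 2, -6, 3);
    __m512i result = _mm512_abs_epi32(a);  // a genuine AVX-512F instruction
    (void)result;
    return 0;  // exiting normally means the CPU actually ran AVX-512F code
  }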
@@ -67,8 +67,8 @@ paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size',
 paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
-paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
-paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
+paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
+paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
@@ -174,9 +174,11 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
@@ -187,6 +189,7 @@ paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, k
 paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
+paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
......
 cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
+cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
@@ -30,20 +31,25 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)

+cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
+
-if(WITH_GPU)
+if (WITH_GPU)
   cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
       all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 endif()

+cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
+
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
     scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)

-if(WITH_GPU)
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
-else()
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
-endif()
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass)
+if (WITH_GPU)
+  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
+endif()
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})

 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
     simple_threadpool device_context)
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
@@ -27,6 +28,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
  public:
   explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
       : ir::PassBuilder(), strategy_(strategy) {
+    if (strategy_.enable_sequential_execution_) {
+      AppendPass("sequential_execution_pass");
+    }
+
     // Add a graph viz pass to record a graph.
     if (!strategy_.debug_graphviz_path_.empty()) {
       auto viz_pass = AppendPass("graph_viz_pass");
@@ -64,6 +69,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Verify that the graph is correct for multi-device executor.
     AppendPass("multi_devices_check_pass");
+
+    if (strategy_.remove_unnecessary_lock_) {
+      AppendPass("modify_op_lock_and_record_event_pass");
+    }
   }

  private:
@@ -110,6 +119,11 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
         pass->Erase("nccl_ctxs");
         pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+      } else if (pass->Type() == "sequential_execution_pass") {
+        pass->Erase(kAllOpDescs);
+        pass->Set<const std::vector<OpDesc *>>(
+            kAllOpDescs,
+            new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
       }
       graph = pass->Apply(std::move(graph));
     }
@@ -125,3 +139,5 @@ USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+USE_PASS(sequential_execution_pass);
+USE_PASS(modify_op_lock_and_record_event_pass);
@@ -69,8 +69,12 @@ struct BuildStrategy {
   bool enable_data_balance_{false};

+  bool enable_sequential_execution_{false};
+
   bool fuse_broadcast_op_{false};

+  bool remove_unnecessary_lock_{false};
+
   // User normally doesn't need to call this API.
   // The PassBuilder allows more customized insertion and removal of passes
   // from the Python side.
......
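For orientation, a minimal sketch of how the two new flags are meant to be driven (member names are from this diff; the surrounding executor setup is assumed):

  paddle::framework::details::BuildStrategy build_strategy;
  // Appends sequential_execution_pass, forcing ops to run in program order.
  build_strategy.enable_sequential_execution_ = true;
  // Appends modify_op_lock_and_record_event_pass, letting eligible GPU ops
  // skip the device lock and CUDA event recording.
  build_strategy.remove_unnecessary_lock_ = true;
  // The strategy is then handed to ParallelExecutor, whose pass builder above
  // translates these flags into the appended passes.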
@@ -29,9 +29,15 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
 void ComputationOpHandle::RunImpl() {
   WaitInputVarGenerated(place_);

-  this->RunAndRecordEvent([this] {
+  auto run_func = [this]() {
     op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
-  });
+  };
+
+  if (is_lock_and_record_event_free_) {
+    run_func();
+  } else {
+    this->RunAndRecordEvent(run_func);
+  }
 }

 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
......
@@ -36,6 +36,8 @@ struct ComputationOpHandle : public OpHandleBase {
   const platform::Place &GetPlace() const { return place_; }

+  void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
+
  protected:
   void RunImpl() override;
@@ -45,6 +47,7 @@ struct ComputationOpHandle : public OpHandleBase {
   std::unique_ptr<OperatorBase> op_;
   Scope *scope_;
   platform::Place place_;
+  bool is_lock_and_record_event_free_{false};
 };

 }  // namespace details
 }  // namespace framework
......
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include <cstddef>  // for size_t

 namespace paddle {
 namespace framework {
@@ -26,6 +27,7 @@ struct ExecutionStrategy {
   bool allow_op_delay_{false};
   size_t num_iteration_per_drop_scope_{100};
   ExecutorType type_{kDefault};
+  bool dry_run_{false};
 };

 }  // namespace details
......
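What the new dry_run_ flag buys, in a short sketch (assuming the usual executor setup): with it set, the executors below still walk the whole dependency graph and scheduling machinery but skip every op's kernel, which isolates pure framework overhead for profiling.

  paddle::framework::details::ExecutionStrategy exec_strategy;
  exec_strategy.dry_run_ = true;  // ops are scheduled but never executed
  // Both FastThreadedSSAGraphExecutor and ThreadedSSAGraphExecutor honor the
  // flag via: if (LIKELY(!strategy_.dry_run_)) { op->Run(strategy_.use_cuda_); }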
@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
     size_t complete = 0;
     while (op_to_run != nullptr) {
       try {
-        op_to_run->Run(strategy_.use_cuda_);
+        if (LIKELY(!strategy_.dry_run_)) {
+          op_to_run->Run(strategy_.use_cuda_);
+        }
         ++complete;
       } catch (...) {
         exception_.Catch(std::current_exception());
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
namespace paddle {
namespace framework {
namespace details {
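// An op can skip the device lock and CUDA event recording when it runs on a
// GPU place and every op that consumes its outputs is also a
// ComputationOpHandle on the same place, so no cross-stream or cross-device
// synchronization is required.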
static bool IsLockAndRecordEventFreeComputationOpHandle(
ComputationOpHandle *op, const OpGraphView &graph_view) {
if (!platform::is_gpu_place(op->GetPlace())) return false;
for (auto &pending_op : graph_view.PendingOps(op)) {
auto *tmp = dynamic_cast<ComputationOpHandle *>(pending_op);
if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) {
return false;
}
}
return true;
}
std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
std::unique_ptr<ir::Graph> ir_graph) const {
auto &all_ops = ir_graph->Get<GraphOps>(kGraphOps);
OpGraphView graph_view(all_ops);
for (auto &op : all_ops) {
auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
if (compute_op == nullptr) continue;
bool is_lock_and_record_event_free =
IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);
compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free);
if (is_lock_and_record_event_free) {
VLOG(10) << "Set is_lock_and_record_event_free be true in op "
<< compute_op->DebugString();
}
}
return ir_graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(modify_op_lock_and_record_event_pass,
paddle::framework::details::ModifyOpLockAndRecordEventPass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
class ModifyOpLockAndRecordEventPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/op_graph_view.h"
#include <queue>
#include <utility>
namespace paddle {
namespace framework {
namespace details {
OpGraphView::OpGraphView(
const std::vector<std::unique_ptr<OpHandleBase>> &ops) {
Build(ops);
}
void OpGraphView::Build(const std::vector<std::unique_ptr<OpHandleBase>> &ops) {
for (auto &op : ops) {
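    // operator[] default-constructs an empty entry, so every op appears in
    // both maps even when it has no predecessors or successors.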
preceding_ops_[op.get()];
pending_ops_[op.get()];
for (auto &var : op->Outputs()) {
for (auto &pending_op : var->PendingOps()) {
preceding_ops_[pending_op].insert(op.get());
pending_ops_[op.get()].insert(pending_op);
}
}
}
PADDLE_ENFORCE(
preceding_ops_.size() == ops.size() && pending_ops_.size() == ops.size(),
"There are duplicate ops in graph.");
}
size_t OpGraphView::OpNumber() const { return preceding_ops_.size(); }
std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
std::unordered_set<OpHandleBase *> ret;
for (auto &pair : preceding_ops_) {
ret.insert(pair.first);
}
return ret;
}
bool OpGraphView::HasOp(OpHandleBase *op) const {
return preceding_ops_.count(op) != 0;
}
void OpGraphView::EnforceHasOp(OpHandleBase *op) const {
PADDLE_ENFORCE(HasOp(op), "Cannot find op %s in OpGraphView",
op == nullptr ? "nullptr" : op->DebugString());
}
const std::unordered_set<OpHandleBase *> &OpGraphView::PrecedingOps(
OpHandleBase *op) const {
EnforceHasOp(op);
return preceding_ops_.at(op);
}
const std::unordered_set<OpHandleBase *> &OpGraphView::PendingOps(
OpHandleBase *op) const {
EnforceHasOp(op);
return pending_ops_.at(op);
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
namespace paddle {
namespace framework {
namespace details {
class OpGraphView {
public:
explicit OpGraphView(const std::vector<std::unique_ptr<OpHandleBase>> &ops);
size_t OpNumber() const;
std::unordered_set<OpHandleBase *> AllOps() const;
const std::unordered_set<OpHandleBase *> &PrecedingOps(
OpHandleBase *op) const;
const std::unordered_set<OpHandleBase *> &PendingOps(OpHandleBase *op) const;
bool HasOp(OpHandleBase *op) const;
private:
void Build(const std::vector<std::unique_ptr<OpHandleBase>> &ops);
void EnforceHasOp(OpHandleBase *op) const;
std::unordered_map<OpHandleBase *, std::unordered_set<OpHandleBase *>>
preceding_ops_;
std::unordered_map<OpHandleBase *, std::unordered_set<OpHandleBase *>>
pending_ops_;
};
} // namespace details
} // namespace framework
} // namespace paddle
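A small usage sketch of OpGraphView (an illustration, not from the commit; `all_ops` is assumed to be the graph's std::vector<std::unique_ptr<OpHandleBase>>, as in the pass above):

  paddle::framework::details::OpGraphView view(all_ops);
  for (auto *op : view.AllOps()) {
    // Ops with no preceding ops are the sources of the dependency graph.
    if (view.PrecedingOps(op).empty()) {
      // e.g. seed a ready queue for scheduling
    }
  }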
@@ -51,7 +51,7 @@ class ReferenceCountOpHandle : public OpHandleBase {
     dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     if (IsStreamGarabageCollector()) {
-      PADDLE_ENFORCE(cudaSetDevice(place.device));
+      platform::SetDeviceId(place.device);
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
     }
@@ -61,7 +61,7 @@ class ReferenceCountOpHandle : public OpHandleBase {
   ~ReferenceCountOpHandle() {
     if (IsStreamGarabageCollector()) {
       auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
-      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      platform::SetDeviceId(gpu_place.device);
       PADDLE_ENFORCE(cudaEventDestroy(event_));
     }
   }
......
@@ -43,6 +43,23 @@ static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) {
   return nullptr;
 }

+static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out,
+                                 ir::Graph *graph) {
+  auto it = std::find_if(
+      in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) {
+        return dynamic_cast<DummyVarHandle *>(var) != nullptr;
+      });
+
+  if (it != in->Outputs().end()) {
+    out->AddInput(*it);
+  } else {
+    auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+    in->AddOutput(dep_var);
+    out->AddInput(dep_var);
+  }
+}
+
 std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
@@ -133,12 +150,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
         auto *ref_cnt_handle = new ReferenceCountOpHandle(
             ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
             gcs[place.device].get(), cur_ref_cnts[place.device].get());
-        if (next_compute_op->Outputs().empty()) {
-          auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-          next_compute_op->AddOutput(dep_var);
-          graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-        }
-        ref_cnt_handle->AddInput(next_compute_op->Outputs().front());
+        AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get());
         compute_ref_cnt_map[next_compute_op].reset(ref_cnt_handle);
       }
     }
@@ -160,12 +172,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
       auto *ref_cnt_handle = new ReferenceCountOpHandle(
           ref_cnt_node, compute_op->GetScope(), place, in_var_names,
           gcs[place.device].get(), cur_ref_cnts[place.device].get());
-      if (compute_op->Outputs().empty()) {
-        auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-        compute_op->AddOutput(dep_var);
-        graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-      }
-      ref_cnt_handle->AddInput(compute_op->Outputs().front());
+      AddDependencyBetween(compute_op, ref_cnt_handle, graph.get());
       compute_ref_cnt_map[compute_op].reset(ref_cnt_handle);
     }
......
@@ -29,22 +29,19 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
       place_(place) {}

 void RPCOpHandle::RunImpl() {
-  // TODO(wuyi): need further analysis whether wait VarDummyHandle.
-  // Wait input done
   for (auto *in : inputs_) {
     auto &p = static_cast<VarHandle *>(in)->place_;
-    // FIXME(Yancey1989): need a better solution instead of use DebugString()
-    if (ir::IsControlDepVar(*in->Node())) {  // HACK
+    if (ir::IsControlDepVar(*in->Node())) {
       continue;
     }
     if (in->GeneratedOp()) {
       in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
     }
   }
-  auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  // FIXME(wuyi): can not use RunAndRecordEvent here, for it will cause dead
-  // lock.
-  op_->Run(*tmp_scope, place_);
+  this->RunAndRecordEvent([this] {
+    op_->Run(*local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(),
+             place_);
+  });
 }

 std::string RPCOpHandle::Name() const { return name_; }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace details {
static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
  // FIXME(zjl): Inserting dependencies between some distributed ops may make
  // multi_devices_graph_pass fail, so we skip those ops here. Indeed, we
  // should probably not insert dependencies between these ops casually at
  // all, since doing so can easily cause deadlock.
  // Add more distributed ops to this skip list as failures show up in
  // multi_devices_graph_pass.
static std::unordered_set<std::string> skip_dist_ops{
"send", "recv", "send_barrier", "fetch_barrier"};
auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
std::vector<ir::Node *> op_node_list;
op_node_list.reserve(ops.size());
std::unordered_map<ir::Node *, size_t> op_deps;
std::unordered_map<ir::Node *, std::unordered_set<ir::Node *>> pending_ops;
std::unordered_set<ir::Node *> ready_ops;
for (ir::Node *node : graph->Nodes()) {
if (!node->IsOp()) continue;
std::unordered_set<ir::Node *> preceding_ops;
for (auto *in : node->inputs) {
PADDLE_ENFORCE(in->IsVar(),
"Preceding Node of Op Nodes must be Var Node");
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
preceding_ops.insert(in->inputs[0]);
pending_ops[in->inputs[0]].insert(node);
}
op_deps[node] = preceding_ops.size();
if (preceding_ops.empty()) {
ready_ops.insert(node);
}
}
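  // Replay the program's ops in order: each op_desc must match exactly one
  // node that is currently ready (all of its preceding ops already matched),
  // which proves the graph can execute in the program's declared order.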
for (auto *op_desc : ops) {
ir::Node *found_node = nullptr;
for (auto *node : ready_ops) {
if (IsSameOpDesc(op_desc, node->Op())) {
PADDLE_ENFORCE(found_node == nullptr,
"Found multiple op_desc in graph: %s", op_desc->Type());
found_node = node;
}
}
PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s",
op_desc->Type());
for (auto *pending_op : pending_ops[found_node]) {
if (--op_deps.at(pending_op) == 0) {
ready_ops.insert(pending_op);
}
}
ready_ops.erase(found_node);
if (skip_dist_ops.count(op_desc->Type()) == 0) {
op_node_list.push_back(found_node);
}
}
for (size_t i = 1; i < op_node_list.size(); ++i) {
auto *dep_var = graph->CreateControlDepVar();
op_node_list[i]->inputs.push_back(dep_var);
op_node_list[i - 1]->outputs.push_back(dep_var);
dep_var->outputs.push_back(op_node_list[i]);
dep_var->inputs.push_back(op_node_list[i - 1]);
VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
<< " and " << op_node_list[i]->Name();
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(sequential_execution_pass,
paddle::framework::details::SequentialExecutionPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
class SequentialExecutionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
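For reference, a sketch of how a pass that requires an attribute gets wired up (this mirrors BuildStrategy::Apply earlier in this commit; the surrounding main_program and graph variables are assumed):

  auto pass = ir::PassRegistry::Instance().Get("sequential_execution_pass");
  pass->Set<const std::vector<OpDesc *>>(
      kAllOpDescs, new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
  graph = pass->Apply(std::move(graph));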
@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
       if (VLOG_IS_ON(10)) {
         VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
       }
-      op->Run(strategy_.use_cuda_);
+      if (LIKELY(!strategy_.dry_run_)) {
+        op->Run(strategy_.use_cuda_);
+      }
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
......
@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   // Use topological sort algorithm
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;

-  ~ThreadedSSAGraphExecutor() {}
+  ~ThreadedSSAGraphExecutor() final = default;

  private:
   void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
......
@@ -85,8 +85,10 @@ Executor::Executor(const platform::Place& place) : place_(place) {}

 void Executor::Close() {
 #ifdef PADDLE_WITH_DISTRIBUTE
+  // TODO(typhoonzero): complete message will need to use real trainer_id,
+  // except 0.
   ::paddle::operators::distributed::RPCClient::GetInstance<
-      ::paddle::operators::distributed::GRPCClient>()
+      ::paddle::operators::distributed::GRPCClient>(0)
       ->SendComplete();
 #endif
 }
......
@@ -41,6 +41,7 @@ pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
 if(WITH_MKLDNN)
   pass_library(mkldnn_placement_pass base)
+  pass_library(depthwise_conv_mkldnn_pass base)
   pass_library(conv_bias_mkldnn_fuse_pass inference)
   pass_library(conv_relu_mkldnn_fuse_pass inference)
   pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
@@ -59,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 if (WITH_MKLDNN)
+  cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
   cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
   cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
@@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase {
   virtual ~ConvReLUFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
......
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
@@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("X", inputs);
   }
   op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }

 // a->OP0->b
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_NODE(id, pattern) \
PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(pattern.RetrieveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get());
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
pattern->NewNode("depthwise_conv")
->assert_is_op("depthwise_conv2d")
->assert_op_attr("use_mkldnn", true);
int found_depthwise_conv_mkldnn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
GET_NODE(depthwise_conv, (*pattern));
depthwise_conv->Op()->SetType("conv2d");
found_depthwise_conv_mkldnn_count++;
};
gpd(graph.get(), handler);
AddStatis(found_depthwise_conv_mkldnn_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(depthwise_conv_mkldnn_pass,
paddle::framework::ir::DepthwiseConvMKLDNNPass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class DepthwiseConvMKLDNNPass : public FusePassBase {
public:
virtual ~DepthwiseConvMKLDNNPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
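A quick sketch of applying this pass (it mirrors the tester below; the graph variable is assumed):

  auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
  graph = pass->Apply(std::move(graph));
  // Every depthwise_conv2d with use_mkldnn == true is rewritten to conv2d;
  // the rationale is that the MKL-DNN conv2d kernel can serve the depthwise
  // case directly, so a separate depthwise op type is unnecessary there.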
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
op->SetInput("Bias", {inputs[2]});
op->SetOutput("Out", outputs);
}
// (a, weights, bias)->depthwise conv mkldnn->b
// (b, weights2, bias2)->depthwise conv no mkldnn->c
// (c, weights3, bias3)->conv mkldnn->d
// (d, weights4, bias4)->conv no mkldnn->e
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v : std::vector<std::string>(
{"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2",
"weights3", "bias3", "weights4", "bias4"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" ||
v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") {
var->SetPersistable(true);
}
}
// depthwise conv with MKL-DNN
SetOp(&prog, "depthwise_conv2d", "conv1",
std::vector<std::string>({"a", "weights", "bias"}),
std::vector<std::string>({"b"}), true);
// depthwise conv without MKL-DNN
SetOp(&prog, "depthwise_conv2d", "conv2",
std::vector<std::string>({"b", "weights2", "bias2"}),
std::vector<std::string>({"c"}), false);
// conv with MKL-DNN
SetOp(&prog, "conv2d", "conv3",
std::vector<std::string>({"c", "weights3", "bias3"}),
std::vector<std::string>({"d"}), true);
// conv without MKL-DNN
SetOp(&prog, "conv2d", "conv4",
std::vector<std::string>({"d", "weights4", "bias4"}),
std::vector<std::string>({"e"}), false);
return prog;
}
TEST(DepthwiseConvMKLDNNPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
struct counters {
int mkldnn_depthwise_conv_nodes;
int other_depthwise_conv_nodes;
int mkldnn_conv_nodes;
int other_conv_nodes;
};
counters before{1, 1, 1, 1};
graph = pass->Apply(std::move(graph));
// initialize counters before loop
counters after{0, 0, 0, 0};
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "conv2d") {
if (boost::get<bool>(op->GetAttr("use_mkldnn")))
after.mkldnn_conv_nodes++;
else
after.other_conv_nodes++;
} else if (op->Type() == "depthwise_conv2d") {
if (boost::get<bool>(op->GetAttr("use_mkldnn")))
after.mkldnn_depthwise_conv_nodes++;
else
after.other_depthwise_conv_nodes++;
}
}
}
EXPECT_EQ(after.other_depthwise_conv_nodes,
before.other_depthwise_conv_nodes);
EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
before.mkldnn_depthwise_conv_nodes - 1);
EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(depthwise_conv_mkldnn_pass);
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
@@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
     op->SetInput("X", inputs);
   }
   op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }

 // a->OP0->b
......
@@ -23,8 +23,67 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+namespace {
+
+void CheckProgram(const ProgramDesc &program) {
+#define _INT(role) static_cast<int>(role)
+
+  std::map<int, bool> visit;
+  for (OpDesc *op : program.Block(0).AllOps()) {
+    // For backward compatibility, some programs don't have the role attribute.
+    if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
+    int role_id =
+        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+    visit[role_id] = true;
+    switch (role_id) {
+      case _INT(OpRole::kForward):
+        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
+          LOG(ERROR)
+              << "Cannot add backward operator before forward operator "
+              << op->Type();
+        }
+        break;
+      case _INT(OpRole::kBackward):
+      case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
+        PADDLE_ENFORCE(
+            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+            "Cannot add backward operator %s after optimize operator.",
+            op->Type());
+        break;
+      case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
+        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
+                                  _INT(OpRole::kLoss)) == visit.end(),
+                       "Cannot add backward|loss operator before "
+                       "forward|loss operator %s.",
+                       op->Type());
+        PADDLE_ENFORCE(
+            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+            "Cannot add forward|loss operator %s after optimize operator.",
+            op->Type());
+        break;
+      case _INT(OpRole::kOptimize):
+      case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
+        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
+                       "Optimize operators %s must follow backward operator.",
+                       op->Type());
+        break;
+      case _INT(OpRole::kLRSched):
+      case _INT(OpRole::kDist):
+      case _INT(OpRole::kRPC):
+      case _INT(OpRole::kNotSpecified):
+        break;
+      default:
+        LOG(FATAL) << "Unknown operator role. Don't add new role because "
+                      "you don't know what you are doing.";
+    }
+  }
+
+#undef _INT
+}
+}  // namespace
+
 Graph::Graph(const ProgramDesc &program) : program_(program) {
+  CheckProgram(program_);
   // Make the nodes id start from 0.
   Node::ResetId();
   auto var_nodes = InitFromProgram(program_);
......
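The switch above works because OpRole values are bit flags, so composite roles are expressed by OR-ing them. A sketch using the enum from op_proto_maker.h:

  // The loss op that ends the forward section carries both bits:
  int fwd_loss =
      static_cast<int>(OpRole::kForward) | static_cast<int>(OpRole::kLoss);
  // Its gradient op carries kBackward | kLoss, matching the
  // "case _INT(OpRole::kBackward) | _INT(OpRole::kLoss)" label above.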
...@@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() { ...@@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() {
return result; return result;
} }
bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
const std::pair<PDNode *, Node *> &b) {
if (a.first != b.first) {
return a.first < b.first;
} else {
return a.second < b.second;
}
}
// TODO(Superjomn) enhance the function as it marks unique unique as duplicates // TODO(Superjomn) enhance the function as it marks unique unique as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550 // see https://github.com/PaddlePaddle/Paddle/issues/13550
void GraphPatternDetector::UniquePatterns( void GraphPatternDetector::UniquePatterns(
...@@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns( ...@@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns(
std::vector<GraphPatternDetector::subgraph_t> result; std::vector<GraphPatternDetector::subgraph_t> result;
std::unordered_set<size_t> set; std::unordered_set<size_t> set;
std::hash<std::string> hasher;
for (auto &g : *subgraphs) { for (auto &g : *subgraphs) {
size_t key = 0; // Sort the items in the sub-graph, and transform to a string key.
for (auto &item : g) { std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
key ^= std::hash<void *>{}(item.first); std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
key ^= std::hash<void *>{}(item.second); std::stringstream ss;
for (auto &item : sorted_keys) {
ss << item.first << ":" << item.second;
} }
auto key = hasher(ss.str());
if (!set.count(key)) { if (!set.count(key)) {
result.emplace_back(g); result.emplace_back(g);
set.insert(key); set.insert(key);
......
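The switch to a sorted, stringified key above is motivated by a real collision: XOR-folding the pointer hashes is order-insensitive and self-cancelling, so distinct subgraphs could receive identical keys and be dropped as duplicates (issue #13550). A minimal sketch of the failure mode, with throwaway local variables standing in for graph nodes:

#include <cassert>
#include <cstddef>
#include <functional>

int main() {
  std::hash<void *> h;
  int x = 0, y = 0;
  void *a = &x, *b = &y;
  // Order is lost: {a, b} and {b, a} fold to the same key.
  size_t k1 = h(a) ^ h(b);
  size_t k2 = h(b) ^ h(a);
  assert(k1 == k2);
  // Pairs cancel: a node XOR-ed with itself contributes nothing.
  assert((h(a) ^ h(a)) == 0);
  return 0;
}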
...@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor( ...@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
for (size_t j = 0; j < lod.size(); ++j) { for (size_t j = 0; j < lod.size(); ++j) {
auto &sub_lod = new_lod[j]; auto &sub_lod = new_lod[j];
auto &offset = sub_lod.back(); size_t offset = sub_lod.back();
for (size_t k = 1; k < lod[j].size(); ++k) { for (size_t k = 1; k < lod[j].size(); ++k) {
sub_lod.push_back(lod[j][k] + offset); sub_lod.push_back(lod[j][k] + offset);
} }
......
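The MergeLoDTensor change above looks cosmetic but fixes a lifetime bug: `auto &offset` bound a reference into the very vector the loop then grows, and push_back may reallocate the storage and leave that reference dangling. A standalone sketch of the hazard, using hypothetical LoD values:

#include <vector>

int main() {
  std::vector<size_t> sub_lod = {0, 3, 5};
  // Buggy form (do not do this): const size_t &offset = sub_lod.back();
  // The push_back below could reallocate and invalidate the reference.
  size_t offset = sub_lod.back();  // safe: an independent copy
  for (size_t k = 1; k < 3; ++k) {
    sub_lod.push_back(offset + k);
  }
  return 0;
}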
...@@ -38,9 +38,20 @@ class ParallelExecutorPrivate { ...@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places) explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
: places_(places) {} : places_(places) {}
~ParallelExecutorPrivate() {
if (own_local_scope_) {
for (size_t i = 1; i < local_scopes_.size(); ++i) {
// Skip the first scope, since it is the global scope.
Scope *local_scope = local_scopes_[i];
if (global_scope_->HasKid(local_scope)) {
global_scope_->DeleteScope(local_scope);
}
}
}
}
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
Scope *global_scope_; Scope *global_scope_; // not owned
std::unique_ptr<details::SSAGraphExecutor> executor_; std::unique_ptr<details::SSAGraphExecutor> executor_;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) { for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait(); platform::DeviceContextPool::Instance().Get(p)->Wait();
} }
if (member_->own_local_scope_) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
Scope *local_scope = member_->local_scopes_[i];
if (member_->global_scope_->HasKid(local_scope)) {
member_->global_scope_->DeleteScope(local_scope);
}
}
}
// member_ must be destructed before gcs_ since the destructor of // member_ must be destructed before gcs_ since the destructor of
// ReferenceCountOpHandle use raw pointers of gcs_ inside. // ReferenceCountOpHandle use raw pointers of gcs_ inside.
member_.reset(); member_.reset();
......
...@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) { ...@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) {
platform::CPUPlace()); platform::CPUPlace());
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
} }
// Not sure if it's desired, but currently, Tensor type can be changed.
{
framework::Tensor src_tensor;
int8_t* p1 = src_tensor.mutable_data<int8_t>(framework::make_ddim({1}),
platform::CPUPlace());
EXPECT_NE(p1, nullptr);
*p1 = 1;
uint8_t* p2 = src_tensor.mutable_data<uint8_t>(framework::make_ddim({1}),
platform::CPUPlace());
EXPECT_NE(p2, nullptr);
EXPECT_EQ(static_cast<int>(p2[0]), 1);
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
{ {
......
...@@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place); auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
nullptr);
} }
#endif #endif
} }
......
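The new TensorCopySync branch covers copies whose source is CUDA-pinned (page-locked) host memory, which is why it gets its own place type instead of falling through to the plain CPU path: pinned buffers are the ones the driver can DMA straight to the device. A minimal sketch of the same copy at the raw CUDA runtime level, assuming a CUDA toolkit is available:

#include <cuda_runtime.h>

int main() {
  float *pinned = nullptr, *device = nullptr;
  // Page-locked host allocation: eligible for direct DMA transfers.
  cudaMallocHost(reinterpret_cast<void **>(&pinned), 4 * sizeof(float));
  cudaMalloc(reinterpret_cast<void **>(&device), 4 * sizeof(float));
  cudaMemcpy(device, pinned, 4 * sizeof(float), cudaMemcpyHostToDevice);
  cudaFree(device);
  cudaFreeHost(pinned);
  return 0;
}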
...@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) { ...@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
ThreadPool::~ThreadPool() { ThreadPool::~ThreadPool() {
{ {
// notify all threads to stop running // notify all threads to stop running
std::lock_guard<std::mutex> l(mutex_); std::unique_lock<std::mutex> l(mutex_);
running_ = false; running_ = false;
scheduled_.notify_all();
} }
scheduled_.notify_all();
for (auto& t : threads_) { for (auto& t : threads_) {
t->join(); t->join();
...@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() { ...@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
void ThreadPool::TaskLoop() { void ThreadPool::TaskLoop() {
while (true) { while (true) {
std::unique_lock<std::mutex> lock(mutex_); Task task;
scheduled_.wait( {
lock, [this] { return !this->tasks_.empty() || !this->running_; }); std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(
lock, [this] { return !this->tasks_.empty() || !this->running_; });
if (!running_ || tasks_.empty()) { if (!running_ && tasks_.empty()) {
return; return;
} }
if (tasks_.empty()) {
PADDLE_THROW("This thread has no task to Run");
}
// pop a task from the task queue // pop a task from the task queue
auto task = std::move(tasks_.front()); task = std::move(tasks_.front());
tasks_.pop(); tasks_.pop();
lock.unlock(); }
// run the task // run the task
task(); task();
......
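Two things change in TaskLoop above: the exit predicate flips from `!running_ || tasks_.empty()` to `!running_ && tasks_.empty()`, so a stopping pool drains queued tasks instead of abandoning them, and the task now executes outside the mutex so other workers can dequeue concurrently. A condensed sketch of the same drain pattern with plain standard-library types:

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>

std::mutex mu;
std::condition_variable cv;
std::queue<std::function<void()>> tasks;
bool running = true;

void Worker() {
  while (true) {
    std::function<void()> task;
    {
      std::unique_lock<std::mutex> lock(mu);
      cv.wait(lock, [] { return !tasks.empty() || !running; });
      if (!running && tasks.empty()) return;  // stopped AND fully drained
      task = std::move(tasks.front());
      tasks.pop();
    }
    task();  // run outside the lock
  }
}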
...@@ -58,7 +58,7 @@ class ThreadPool { ...@@ -58,7 +58,7 @@ class ThreadPool {
~ThreadPool(); ~ThreadPool();
// Run pushes a function to the task queue and returns a std::future // Run pushes a function to the task queue and returns a std::future
// object. To wait for the completion of the task, call // object. To wait for the completion of the task, call
// std::future::wait(). // std::future::wait().
template <typename Callback> template <typename Callback>
std::future<void> Run(Callback fn) { std::future<void> Run(Callback fn) {
...@@ -69,7 +69,6 @@ class ThreadPool { ...@@ -69,7 +69,6 @@ class ThreadPool {
template <typename Callback> template <typename Callback>
std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException( std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
Callback fn) { Callback fn) {
std::unique_lock<std::mutex> lock(mutex_);
Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> { Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
try { try {
fn(); fn();
...@@ -84,7 +83,13 @@ class ThreadPool { ...@@ -84,7 +83,13 @@ class ThreadPool {
return nullptr; return nullptr;
}); });
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future(); std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
tasks_.push(std::move(task)); {
std::unique_lock<std::mutex> lock(mutex_);
if (!running_) {
PADDLE_THROW("enqueue on stopped ThreadPool");
}
tasks_.push(std::move(task));
}
scheduled_.notify_one(); scheduled_.notify_one();
return f; return f;
} }
......
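The enqueue side mirrors that: the future is obtained before the lock, the mutex only guards the queue push, and enqueuing on a stopped pool now throws instead of silently queuing work no one will run. A self-contained sketch of that shape (the names here are illustrative, not Paddle's):

#include <condition_variable>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <stdexcept>

std::mutex mu;
std::condition_variable cv;
std::queue<std::packaged_task<void()>> tasks;
bool running = true;

std::future<void> Enqueue(std::function<void()> fn) {
  std::packaged_task<void()> task(std::move(fn));
  std::future<void> f = task.get_future();  // created before locking
  {
    std::lock_guard<std::mutex> lock(mu);
    if (!running) throw std::runtime_error("enqueue on stopped ThreadPool");
    tasks.push(std::move(task));
  }
  cv.notify_one();  // wake exactly one waiting worker
  return f;
}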
if(WITH_TESTING)
    include(tests/test.cmake) # generic cmake functions for inference
endif()
# analysis and tensorrt must be added before creating static library, # analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library. # otherwise, there would be undefined reference to them in static library.
add_subdirectory(analysis) add_subdirectory(analysis)
......
...@@ -79,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -79,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> {
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass", //
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
...@@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type, ...@@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type,
op->SetType(type); op->SetType(type);
op->SetInput("Xs", inputs); op->SetInput("Xs", inputs);
op->SetOutput("Xs", outputs); op->SetOutput("Xs", outputs);
op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(framework::OpRole::kForward));
} }
TEST(DataFlowGraph, Build_IR_Graph) { TEST(DataFlowGraph, Build_IR_Graph) {
......
...@@ -17,39 +17,12 @@ if(APPLE) ...@@ -17,39 +17,12 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE) endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}
)
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
endif() endif()
function(inference_api_test TARGET_NAME)
if (WITH_TESTING)
set(options "")
set(oneValueArgs SRC)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (WITH_GPU)
cc_test(${TARGET_NAME}
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15)
else()
cc_test(${TARGET_NAME}
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
endif()
if(inference_test_ARGS)
set_tests_properties(${TARGET_NAME}
PROPERTIES DEPENDS "${inference_test_ARGS}")
endif()
endif(WITH_TESTING)
endfunction(inference_api_test)
cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
...@@ -59,8 +32,11 @@ cc_test(test_paddle_inference_api ...@@ -59,8 +32,11 @@ cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
inference_api_test(test_api_impl SRC api_impl_tester.cc if(WITH_TESTING)
ARGS test_word2vec test_image_classification) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
endif()
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book) ARGS --dirname=${PYTHON_TESTS_DIR}/book)
...@@ -68,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -68,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
if(WITH_TESTING)
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
endif() endif()
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
...@@ -22,12 +22,14 @@ limitations under the License. */ ...@@ -22,12 +22,14 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#ifdef __clang__ #ifdef __clang__
#define ACC_DIFF 4e-2 #define ACC_DIFF 4e-3
#else #else
#define ACC_DIFF 1e-2 #define ACC_DIFF 1e-3
#endif #endif
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(word2vec_dirname, "",
"Directory of the word2vec inference model.");
DEFINE_string(book_dirname, "", "Directory of the book inference model.");
namespace paddle { namespace paddle {
...@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { ...@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig GetConfig() { NativeConfig GetConfig() {
NativeConfig config; NativeConfig config;
config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; config.model_dir = FLAGS_word2vec_dirname;
LOG(INFO) << "dirname " << config.model_dir; LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) { ...@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "/image_classification_resnet.inference.model"; FLAGS_book_dirname + "/image_classification_resnet.inference.model";
const bool is_combined = false; const bool is_combined = false;
std::vector<std::vector<int64_t>> feed_target_shapes = std::vector<std::vector<int64_t>> feed_target_shapes =
...@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "/image_classification_resnet.inference.model"; FLAGS_book_dirname + "/image_classification_resnet.inference.model";
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config); auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs); std::vector<framework::LoDTensor> jobs(num_jobs);
......
...@@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { ...@@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
//# 1. Create PaddlePredictor with a config. //# 1. Create PaddlePredictor with a config.
NativeConfig config0; NativeConfig config0;
config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; config0.model_dir = FLAGS_dirname;
config0.use_gpu = true; config0.use_gpu = true;
config0.fraction_of_gpu_memory = 0.3; config0.fraction_of_gpu_memory = 0.3;
config0.device = 0; config0.device = 0;
MixedRTConfig config1; MixedRTConfig config1;
config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; config1.model_dir = FLAGS_dirname;
config1.use_gpu = true; config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3; config1.fraction_of_gpu_memory = 0.3;
config1.device = 0; config1.device = 0;
......
...@@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_GPU=$TEST_GPU_CPU \ -DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB -DWITH_STATIC_LIB=$WITH_STATIC_LIB
make -j make -j
word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model' word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
if [ -d $word2vec_model ]; then if [ -d $word2vec_model ]; then
for use_gpu in $use_gpu_list; do for use_gpu in $use_gpu_list; do
./simple_on_word2vec \ ./simple_on_word2vec \
......
...@@ -70,12 +70,8 @@ void Main(bool use_gpu) { ...@@ -70,12 +70,8 @@ void Main(bool use_gpu) {
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements); for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
i++) { i++) {
// Here will result random fail, for that the model is trained by CI, the CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
// train phase is not stable, so the result will be random. 0.001);
// TODO(Superjomn) will restore after the model is upload.
// CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
// result[i],
// 0.001);
} }
} }
} }
......
...@@ -18,6 +18,21 @@ namespace paddle { ...@@ -18,6 +18,21 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
bool to_skip_merging_optimize(TensorRTEngine* engine_,
const std::vector<int>& filters,
const std::vector<int>& strides,
const std::vector<int>& paddings,
std::string input_name) {
if (engine_->itensor_quote_num[input_name] > 0) {
return true;
}
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
engine_->itensor_quote_num[input_name] += 1;
return false;
}
class Conv2dOpConverter : public OpConverter { class Conv2dOpConverter : public OpConverter {
public: public:
void operator()(const framework::proto::OpDesc& op, void operator()(const framework::proto::OpDesc& op,
...@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter { ...@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("Input").front()); auto* X = engine_->GetITensor(op_desc.Input("Input").front());
// Declare weights // Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v); PADDLE_ENFORCE_NOT_NULL(Y_v);
...@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter { ...@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
std::move(weight_tensor); std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str()); layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
if (test_mode ||
to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
paddings, op_desc.Input("Input").front())) {
engine_->DeclareOutput(output_name); engine_->DeclareOutput(output_name);
} }
} }
......
...@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, ...@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
buffer_sizes_[name] = 0; buffer_sizes_[name] = 0;
} }
bool TensorRTEngine::HasDeclared(const std::string &name) {
return buffer_sizes_.count(name) > 0;
}
void TensorRTEngine::DeclareOutput(const std::string &name) { void TensorRTEngine::DeclareOutput(const std::string &name) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
name); name);
......
...@@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase { ...@@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase {
const std::string& name); const std::string& name);
// Set the itensor_map_[name] as the network's output, and set its name. // Set the itensor_map_[name] as the network's output, and set its name.
void DeclareOutput(const std::string& name); void DeclareOutput(const std::string& name);
// Check if the ITensor has been declared
bool HasDeclared(const std::string& name);
// GPU memory address for an ITensor with specific name. One can operate on // GPU memory address for an ITensor with specific name. One can operate on
// these memory directly for acceleration, for example, output the converted // these memory directly for acceleration, for example, output the converted
...@@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase { ...@@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>> std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map; weight_map;
  // TODO(NHZLX):
  // paddle-trt currently has a bug when running googlenet: when two or more
  // 1 * 1 convolutions share the same input, paddle-tensorrt applies a
  // merging optimization that fuses those convs into one conv, which
  // triggers the bug. As a workaround, we skip this optimization for such
  // inputs for the time being; the underlying bug will be fixed in the
  // future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
itensor_quote_num;
private: private:
// the max batch size // the max batch size
int max_batch_; int max_batch_;
......
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
message(STATUS "finish downloading ${filename}")
endfunction()
function (inference_download_and_uncompress install_dir url filename) function(download_model install_dir model_name)
inference_download(${install_dir} ${url} ${filename}) if (NOT EXISTS ${install_dir})
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
endif()
endfunction() endfunction()
function(download_model_and_data install_dir model_name data_name) function(download_model_and_data install_dir model_name data_name)
...@@ -27,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename) ...@@ -27,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction() endfunction()
function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
download_model(${install_dir} ${model_name})
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${install_dir}/model)
endfunction()
# RNN1 # RNN1
if(NOT APPLE) if(NOT APPLE)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
...@@ -43,6 +42,15 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") ...@@ -43,6 +42,15 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
# DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS
--infer_model=${DAM_INSTALL_DIR}/model
--infer_data=${DAM_INSTALL_DIR}/data.txt
--use_analysis=0)
# chinese_ner # chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
...@@ -66,17 +74,13 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana ...@@ -66,17 +74,13 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# ocr # ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR}) if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif() endif()
inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
# resnet50 # resnet50
set(RESNET50_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
if (NOT EXISTS ${RESNET50_INSTALL_DIR}) "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
inference_download_and_uncompress(${RESNET50_INSTALL_DIR} ${INFERENCE_URL} "resnet50_model.tar.gz")
endif()
inference_analysis_test(test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_INSTALL_DIR}/model)
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
#define MAX_TURN_NUM 9
#define MAX_TURN_LEN 50
static std::vector<float> result_data;
struct DataRecord {
std::vector<std::vector<int64_t>>
turns[MAX_TURN_NUM]; // turns data : MAX_TURN_NUM
std::vector<std::vector<float>>
turns_mask[MAX_TURN_NUM]; // turns mask data : MAX_TURN_NUM
std::vector<std::vector<int64_t>> response; // response data : 1
std::vector<std::vector<float>> response_mask; // response mask data : 1
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
    // NOTE: skip the final batch if not enough data is provided.
if (batch_end <= response.size()) {
for (int i = 0; i < MAX_TURN_NUM; ++i) {
data.turns[i].assign(turns[i].begin() + batch_iter,
turns[i].begin() + batch_end);
}
for (int i = 0; i < MAX_TURN_NUM; ++i) {
data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter,
turns_mask[i].begin() + batch_end);
}
data.response.assign(response.begin() + batch_iter,
response.begin() + batch_end);
data.response_mask.assign(response_mask.begin() + batch_iter,
response_mask.begin() + batch_end);
CHECK(!data.response.empty());
CHECK(!data.response_mask.empty());
CHECK_EQ(data.response.size(), data.response_mask.size());
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
size_t num_lines = 0;
result_data.clear();
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ',', &data);
CHECK_EQ(data.size(), 2 * MAX_TURN_NUM + 3);
// load turn data
std::vector<int64_t> turns_tmp[MAX_TURN_NUM];
for (int i = 0; i < MAX_TURN_NUM; ++i) {
split_to_int64(data[i], ' ', &turns_tmp[i]);
turns[i].push_back(std::move(turns_tmp[i]));
}
// load turn_mask data
std::vector<float> turns_mask_tmp[MAX_TURN_NUM];
for (int i = 0; i < MAX_TURN_NUM; ++i) {
split_to_float(data[MAX_TURN_NUM + i], ' ', &turns_mask_tmp[i]);
turns_mask[i].push_back(std::move(turns_mask_tmp[i]));
}
// load response data
std::vector<int64_t> response_tmp;
split_to_int64(data[2 * MAX_TURN_NUM], ' ', &response_tmp);
response.push_back(std::move(response_tmp));
// load response_mask data
std::vector<float> response_mask_tmp;
split_to_float(data[2 * MAX_TURN_NUM + 1], ' ', &response_mask_tmp);
response_mask.push_back(std::move(response_mask_tmp));
// load result data
float result_tmp;
result_tmp = std::stof(data[2 * MAX_TURN_NUM + 2]);
result_data.push_back(result_tmp);
}
num_samples = num_lines;
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor turns_tensor[MAX_TURN_NUM];
PaddleTensor turns_mask_tensor[MAX_TURN_NUM];
PaddleTensor response_tensor;
PaddleTensor response_mask_tensor;
std::string turn_pre = "turn_";
std::string turn_mask_pre = "turn_mask_";
auto one_batch = data->NextBatch();
int size = one_batch.response[0].size();
CHECK_EQ(size, MAX_TURN_LEN);
// turn tensor assignment
for (int i = 0; i < MAX_TURN_NUM; ++i) {
turns_tensor[i].name = turn_pre + std::to_string(i);
turns_tensor[i].shape.assign({batch_size, size, 1});
turns_tensor[i].dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&turns_tensor[i], one_batch.turns[i]);
}
// turn mask tensor assignment
for (int i = 0; i < MAX_TURN_NUM; ++i) {
turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i);
turns_mask_tensor[i].shape.assign({batch_size, size, 1});
turns_mask_tensor[i].dtype = PaddleDType::FLOAT32;
TensorAssignData<float>(&turns_mask_tensor[i], one_batch.turns_mask[i]);
}
// response tensor assignment
response_tensor.name = "response";
response_tensor.shape.assign({batch_size, size, 1});
response_tensor.dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&response_tensor, one_batch.response);
// response mask tensor assignment
response_mask_tensor.name = "response_mask";
response_mask_tensor.shape.assign({batch_size, size, 1});
response_mask_tensor.dtype = PaddleDType::FLOAT32;
TensorAssignData<float>(&response_mask_tensor, one_batch.response_mask);
// Set inputs.
for (int i = 0; i < MAX_TURN_NUM; ++i) {
input_slots->push_back(std::move(turns_tensor[i]));
}
for (int i = 0; i < MAX_TURN_NUM; ++i) {
input_slots->push_back(std::move(turns_mask_tensor[i]));
}
input_slots->push_back(std::move(response_tensor));
input_slots->push_back(std::move(response_mask_tensor));
}
void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots;
int test_batch_num =
FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
LOG(INFO) << "The number of samples to be test: "
<< test_batch_num * FLAGS_batch_size;
for (int bid = 0; bid < test_batch_num; ++bid) {
input_slots.clear();
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
}
}
// Easy for profiling independently.
TEST(Analyzer_dam, profile) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
size_t size = GetSize(outputs[0]);
PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(outputs[0].data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(result[i], result_data[i], 1e-3);
}
}
}
// Check the fuse status
TEST(Analyzer_dam, fuse_statis) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
if (FLAGS_use_analysis) {
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 317);
EXPECT_EQ(num_ops, 2020);
}
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_dam, compare) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
if (FLAGS_use_analysis) {
CompareNativeAndAnalysis(cfg, input_slots_all);
}
}
} // namespace inference
} // namespace paddle
...@@ -20,7 +20,6 @@ using contrib::AnalysisConfig; ...@@ -20,7 +20,6 @@ using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<std::vector<int64_t>> word_data_all, mention_data_all; std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
std::vector<std::vector<int64_t>> rnn_word_datas, rnn_mention_datas;
std::vector<size_t> lod; // two inputs have the same lod info. std::vector<size_t> lod; // two inputs have the same lod info.
size_t batch_iter{0}; size_t batch_iter{0};
size_t batch_size{1}; size_t batch_size{1};
...@@ -45,8 +44,6 @@ struct DataRecord { ...@@ -45,8 +44,6 @@ struct DataRecord {
CHECK(!data.mention_data_all.empty()); CHECK(!data.mention_data_all.empty());
CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size());
for (size_t j = 0; j < data.word_data_all.size(); j++) { for (size_t j = 0; j < data.word_data_all.size(); j++) {
data.rnn_word_datas.push_back(data.word_data_all[j]);
data.rnn_mention_datas.push_back(data.mention_data_all[j]);
// calculate lod // calculate lod
data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); data.lod.push_back(data.lod.back() + data.word_data_all[j].size());
} }
...@@ -87,8 +84,8 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -87,8 +84,8 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
lod_mention_tensor.shape.assign({size, 1}); lod_mention_tensor.shape.assign({size, 1});
lod_mention_tensor.lod.assign({one_batch.lod}); lod_mention_tensor.lod.assign({one_batch.lod});
// assign data // assign data
TensorAssignData<int64_t>(&lod_word_tensor, one_batch.rnn_word_datas); TensorAssignData<int64_t>(&lod_word_tensor, one_batch.word_data_all);
TensorAssignData<int64_t>(&lod_mention_tensor, one_batch.rnn_mention_datas); TensorAssignData<int64_t>(&lod_mention_tensor, one_batch.mention_data_all);
// Set inputs. // Set inputs.
input_slots->assign({lod_word_tensor, lod_mention_tensor}); input_slots->assign({lod_word_tensor, lod_mention_tensor});
for (auto &tensor : *input_slots) { for (auto &tensor : *input_slots) {
......
...@@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) {
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); SetFakeImageInput(inputs, FLAGS_infer_model);
PaddleTensor input;
// channel=3, height/width=318
std::vector<int> shape({FLAGS_batch_size, 3, 318, 318});
input.shape = shape;
input.dtype = PaddleDType::FLOAT32;
// fill input data, for profile easily, do not use random data here.
size_t size = FLAGS_batch_size * 3 * 318 * 318;
input.data.Resize(size * sizeof(float));
float *input_data = static_cast<float *>(input.data.data());
for (size_t i = 0; i < size; i++) {
*(input_data + i) = static_cast<float>(i) / size;
}
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
} }
// Easy for profiling independently. // Easy for profiling independently.
...@@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) { ...@@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
size_t size = GetSize(outputs[0]);
// output is a 512-dimension feature
EXPECT_EQ(size, 512 * FLAGS_batch_size);
}
} }
TEST(Analyzer_resnet50, profile) { profile(); } TEST(Analyzer_resnet50, profile) { profile(); }
...@@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) { ...@@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) {
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis( auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops); static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse")); LOG(INFO) << "num_ops: " << num_ops;
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_model, "", "model path");
...@@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor, ...@@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
return fuse_statis; return fuse_statis;
} }
void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &dirname) {
// Set fake_image_data
  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0,
                    "Only a single batch of data is supported.");
std::vector<std::vector<int64_t>> feed_target_shapes =
GetFeedTargetShapes(dirname, true, "model", "params");
int dim1 = feed_target_shapes[0][1];
int dim2 = feed_target_shapes[0][2];
int dim3 = feed_target_shapes[0][3];
PaddleTensor input;
std::vector<int> shape({FLAGS_batch_size, dim1, dim2, dim3});
input.shape = shape;
input.dtype = PaddleDType::FLOAT32;
  // fill the input data; for easy profiling, do not use random data here.
size_t size = FLAGS_batch_size * dim1 * dim2 * dim3;
input.data.Resize(size * sizeof(float));
float *input_data = static_cast<float *>(input.data.data());
for (size_t i = 0; i < size; i++) {
*(input_data + i) = static_cast<float>(i) / size;
}
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
}
void TestOneThreadPrediction( void TestOneThreadPrediction(
const AnalysisConfig &config, const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
......
...@@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { ...@@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
} }
} }
TEST(trt_models_test, main) { TEST(trt_models_test, mobilenet) {
std::vector<std::string> infer_models = {"mobilenet", "resnet50", CompareTensorRTWithFluid(1, FLAGS_dirname + "/mobilenet");
"resnext50"}; }
for (auto &model_dir : infer_models) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir); TEST(trt_models_test, resnet50) {
} CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnet50");
} }
TEST(trt_models_test, resnext50) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnext50");
}
} // namespace paddle } // namespace paddle
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
message(STATUS "finish downloading ${filename}")
endfunction()
function (inference_download_and_uncompress install_dir url filename)
inference_download(${install_dir} ${url} ${filename})
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if (NOT EXISTS ${WORD2VEC_INSTALL_DIR})
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
function (inference_base_test TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS ARGS DEPS)
cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
endif()
cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS})
endfunction()
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1, ...@@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1,
std::unique_ptr<paddle::framework::ProgramDesc> InitProgram( std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
paddle::framework::Executor* executor, paddle::framework::Scope* scope, paddle::framework::Executor* executor, paddle::framework::Scope* scope,
const std::string& dirname, const bool is_combined = false) { const std::string& dirname, const bool is_combined = false,
const std::string& prog_filename = "__model_combined__",
const std::string& param_filename = "__params_combined__") {
std::unique_ptr<paddle::framework::ProgramDesc> inference_program; std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
if (is_combined) { if (is_combined) {
// All parameters are saved in a single file. // All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest. // Hard-coding the file names of program and parameters in unittest.
// The file names should be consistent with that used in Python API // The file names should be consistent with that used in Python API
// `fluid.io.save_inference_model`. // `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program = inference_program =
paddle::inference::Load(executor, scope, dirname + "/" + prog_filename, paddle::inference::Load(executor, scope, dirname + "/" + prog_filename,
dirname + "/" + param_filename); dirname + "/" + param_filename);
...@@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram( ...@@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
} }
std::vector<std::vector<int64_t>> GetFeedTargetShapes( std::vector<std::vector<int64_t>> GetFeedTargetShapes(
const std::string& dirname, const bool is_combined = false) { const std::string& dirname, const bool is_combined = false,
const std::string& prog_filename = "__model_combined__",
const std::string& param_filename = "__params_combined__") {
auto place = paddle::platform::CPUPlace(); auto place = paddle::platform::CPUPlace();
auto executor = paddle::framework::Executor(place); auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope(); auto* scope = new paddle::framework::Scope();
auto inference_program = InitProgram(&executor, scope, dirname, is_combined); auto inference_program = InitProgram(&executor, scope, dirname, is_combined,
prog_filename, param_filename);
auto& global_block = inference_program->Block(0); auto& global_block = inference_program->Block(0);
const std::vector<std::string>& feed_target_names = const std::vector<std::string>& feed_target_names =
...@@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes( ...@@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
return feed_target_shapes; return feed_target_shapes;
} }
void Compile(paddle::framework::ProgramDesc* program) {
std::unique_ptr<paddle::framework::ir::Graph> g(
new paddle::framework::ir::Graph(*program));
auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
"graph_to_program_pass");
pass->SetNotOwned<paddle::framework::ProgramDesc>("program", program);
pass->Apply(std::move(g));
}
template <typename Place, bool CreateVars = true, bool PrepareContext = false> template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
...@@ -182,7 +175,6 @@ void TestInference(const std::string& dirname, ...@@ -182,7 +175,6 @@ void TestInference(const std::string& dirname,
paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::platform::DeviceContextPool::Instance().Get(place));
inference_program = InitProgram(&executor, scope, dirname, is_combined); inference_program = InitProgram(&executor, scope, dirname, is_combined);
} }
Compile(inference_program.get());
// Disable the profiler and print the timing information // Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
...@@ -261,5 +253,3 @@ void TestInference(const std::string& dirname, ...@@ -261,5 +253,3 @@ void TestInference(const std::string& dirname,
delete scope; delete scope;
} }
USE_PASS(graph_to_program_pass);
...@@ -26,6 +26,8 @@ namespace plat = paddle::platform; ...@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \ act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \ ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \ ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>); ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
...@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> { ...@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
const Out out_conj = Eigen::numext::conj(out); dx.device(d) = static_cast<T>(0.5) * dout / out;
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
} }
}; };
...@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> { ...@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(factor) * dx.device(d) = dout * static_cast<T>(factor) *
x.pow(static_cast<T>(factor - static_cast<T>(1))); x.pow(static_cast<T>(factor) - static_cast<T>(1));
} }
}; };
......
...@@ -119,8 +119,8 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> { ...@@ -119,8 +119,8 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>(); auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
// 2. m += g_m * g_m // 2. m += g_m * g_m
math::scatter::Mul<platform::CPUDeviceContext, T> sqare_func; auto grad_square =
auto grad_square = sqare_func(context, grad_merge, grad_merge); SquareSelectedRows<platform::CPUDeviceContext, T>(context, grad_merge);
math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor; math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
functor(context, grad_square, moment); functor(context, grad_square, moment);
......
...@@ -84,8 +84,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -84,8 +84,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>(); auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
framework::Vector<int64_t> merge_rows(grad_merge.rows()); framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m // 2. m += g_m * g_m
math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func; auto grad_square =
auto grad_square = sqare_func(context, grad_merge, grad_merge); SquareSelectedRows<platform::CUDADeviceContext, T>(context, grad_merge);
math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor; math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
functor(context, grad_square, moment); functor(context, grad_square, moment);
......
...@@ -28,6 +28,20 @@ struct SparseAdagradFunctor { ...@@ -28,6 +28,20 @@ struct SparseAdagradFunctor {
framework::Tensor *moment, framework::Tensor *param); framework::Tensor *moment, framework::Tensor *param);
}; };
template <typename DeviceContext, typename T>
framework::SelectedRows SquareSelectedRows(
const DeviceContext &context, const framework::SelectedRows &input) {
framework::SelectedRows out;
out.set_rows(input.rows());
out.set_height(input.height());
out.mutable_value()->mutable_data<T>(input.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in = framework::EigenVector<T>::Flatten(input.value());
e_out.device(*context.eigen_device()) = e_in.square();
return out;
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class AdagradOpKernel : public framework::OpKernel<T> { class AdagradOpKernel : public framework::OpKernel<T> {
public: public:
......
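For orientation, SparseAdagradFunctor above is the sparse form of the standard Adagrad rule: accumulate the squared gradient into the moment, then scale the step by the inverse root of that accumulator. A dense, single-scalar sketch of the update the surrounding kernels implement (learning rate and epsilon are caller-supplied):

#include <cmath>

// m += g * g;  p -= lr * g / (sqrt(m) + eps)
void AdagradStep(float *param, float *moment, float grad, float lr,
                 float epsilon) {
  *moment += grad * grad;
  *param -= lr * grad / (std::sqrt(*moment) + epsilon);
}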
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedSpatialTransformerDescriptor =
platform::ScopedSpatialTransformerDescriptor;
template <typename T>
class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto* theta = ctx.Input<Tensor>("Theta");
auto* output = ctx.Output<Tensor>("Output");
const T* theta_data = theta->data<T>();
int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
Tensor h_sizes;
int* h_size_data;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
h_size_data = h_sizes.data<int>();
} else {
h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
h_size_data[0] = n;
h_size_data[1] = size_attr[1];
h_size_data[2] = size_attr[2];
h_size_data[3] = size_attr[3];
}
T* output_data = output->mutable_data<T>(
{n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace());
ScopedSpatialTransformerDescriptor st_desc;
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data);
PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
handle, cudnn_st_desc, theta_data, output_data));
}
};
template <typename T>
class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
Tensor h_sizes;
int* h_size_data;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
h_size_data = h_sizes.data<int>();
} else {
h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
h_size_data[0] = n;
h_size_data[1] = size_attr[1];
h_size_data[2] = size_attr[2];
h_size_data[3] = size_attr[3];
}
ScopedSpatialTransformerDescriptor st_desc;
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data);
const T* output_grad_data = output_grad->data<T>();
T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
handle, cudnn_st_desc, output_grad_data, theta_grad_data));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNAffineGridOpKernel<float>,
paddle::operators::CUDNNAffineGridOpKernel<double>);
REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNAffineGridGradOpKernel<float>,
paddle::operators::CUDNNAffineGridGradOpKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/affine_grid_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
struct Linspace<paddle::platform::CPUDeviceContext, T> {
void operator()(T start, T end, int count, framework::Tensor* numbers,
const framework::ExecutionContext& ctx) {
T* number_data = numbers->mutable_data<T>({count}, platform::CPUPlace());
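// NOTE: assumes count > 1; with count == 1 the step below divides by zero.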
T slice = (end - start) / (T)(count - 1);
for (int i = 0; i < count; ++i) {
number_data[i] = start + (T)i * slice;
}
}
};
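// Editorial note: the specialization above is the whole CPU contract of
// Linspace: `count` evenly spaced values from `start` to `end`, inclusive.
// A minimal standalone sketch of the same computation (illustrative only,
// not part of the operator; assumes count > 1):
//
//   std::vector<float> linspace(float start, float end, int count) {
//     std::vector<float> numbers(count);
//     float step = (end - start) / static_cast<float>(count - 1);
//     for (int i = 0; i < count; ++i) numbers[i] = start + i * step;
//     return numbers;  // linspace(-1, 1, 5) -> {-1, -0.5, 0, 0.5, 1}
//   }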
class AffineGridOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Theta"),
"Input(Theta) of AffineGridOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of AffineGridOp should not be null.");
auto theta_dims = ctx->GetInputDim("Theta");
PADDLE_ENFORCE(theta_dims.size() == 3,
"AffineGrid's Input(Theta) should be 3-D tensor.");
auto output_shape = ctx->Attrs().Get<std::vector<int>>("output_shape");
if (output_shape.size() == 0) {
PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
"Input(OutputShape) of AffineGridOp should not be null if "
"attr(output_shape) is not configured.");
auto output_shape_dims = ctx->GetInputDim("OutputShape");
PADDLE_ENFORCE(output_shape_dims.size() == 1,
"AffineGrid's Input(OutputShape) should be 1-D tensor.");
} else {
PADDLE_ENFORCE(output_shape.size() == 4,
"The size of attr(output_shape) should be 4.");
}
PADDLE_ENFORCE(theta_dims[1] == 2, "Input(Theta) dims[1] should be 2.");
PADDLE_ENFORCE(theta_dims[2] == 3, "Input(Theta) dims[2] should be 3.");
// N * H * W * 2
ctx->SetOutputDim("Output",
framework::make_ddim({theta_dims[0], -1, -1, 2}));
ctx->ShareLoD("Theta", "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library = framework::LibraryType::kCUDNN;
}
#endif
auto data_type = framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
return framework::OpKernelType(data_type, ctx.GetPlace(),
framework::DataLayout::kAnyLayout, library);
}
};
class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Theta",
"(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
"It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
"y_1).");
AddInput("OutputShape",
"(Tensor) The shape of target image with format [N, C, H, W].")
.AsDispensable();
AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(true);
AddAttr<std::vector<int>>(
"output_shape",
"The target output image shape with format [N, C, H, W].")
.SetDefault(std::vector<int>());
AddComment(R"DOC(
It generates a grid of (x,y) coordinates using the parameters of the
affine transformation that correspond to a set of points where the input
feature map should be sampled to produce the transformed output feature map.
Given:
Theta = [[[x_11, x_12, x_13]
[x_14, x_15, x_16]]
[[x_21, x_22, x_23]
[x_24, x_25, x_26]]]
OutputShape = [2, 3, 5, 5]
Step 1:
Generate relative coordinates according to OutputShape.
The values of relative coordinates are in the interval between -1 and 1.
The shape of the relative coordinates is [2, H, W] as below:
C = [[[-1. -1. -1. -1. -1. ]
[-0.5 -0.5 -0.5 -0.5 -0.5]
[ 0. 0. 0. 0. 0. ]
[ 0.5 0.5 0.5 0.5 0.5]
[ 1. 1. 1. 1. 1. ]]
[[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]]]
C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.
Step 2:
Transpose and reshape C to shape [H * W, 2] and append ones to the last dimension. Then we get:
C_ = [[-1. -1. 1. ]
[-0.5 -1. 1. ]
[ 0. -1. 1. ]
[ 0.5 -1. 1. ]
[ 1. -1. 1. ]
[-1. -0.5 1. ]
[-0.5 -0.5 1. ]
[ 0. -0.5 1. ]
[ 0.5 -0.5 1. ]
[ 1. -0.5 1. ]
[-1. 0. 1. ]
[-0.5 0. 1. ]
[ 0. 0. 1. ]
[ 0.5 0. 1. ]
[ 1. 0. 1. ]
[-1. 0.5 1. ]
[-0.5 0.5 1. ]
[ 0. 0.5 1. ]
[ 0.5 0.5 1. ]
[ 1. 0.5 1. ]
[-1. 1. 1. ]
[-0.5 1. 1. ]
[ 0. 1. 1. ]
[ 0.5 1. 1. ]
[ 1. 1. 1. ]]
Step 3:
Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
)DOC");
}
};
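// Editorial note: a standalone reference sketch of the three DOC steps above
// for one sample (illustrative only, not part of the operator; `theta` is a
// single [2, 3] transform, `out` receives h * w pairs of (x, y) coordinates):
//
//   void affine_grid_reference(const float theta[2][3], int h, int w,
//                              std::vector<float>* out) {
//     out->resize(h * w * 2);
//     for (int i = 0; i < h; ++i) {
//       for (int j = 0; j < w; ++j) {
//         // Step 1: relative coordinates in [-1, 1]
//         float y = (h > 1) ? -1.f + 2.f * i / (h - 1) : 0.f;
//         float x = (w > 1) ? -1.f + 2.f * j / (w - 1) : 0.f;
//         float row[3] = {x, y, 1.f};  // Step 2: one row of C_
//         for (int k = 0; k < 2; ++k) {  // Step 3: Output = C_ * Theta^T
//           (*out)[(i * w + j) * 2 + k] = row[0] * theta[k][0] +
//                                         row[1] * theta[k][1] +
//                                         row[2] * theta[k][2];
//         }
//       }
//     }
//   }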
class AffineGridOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto theta_dims = ctx->GetInputDim("Theta");
if (ctx->HasOutput(framework::GradVarName("Theta"))) {
ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Theta")->type()),
ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_);
}
};
class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("affine_grid_grad");
op->SetInput("Theta", Input("Theta"));
op->SetInput("OutputShape", Input("OutputShape"));
op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
ops::AffineGridGradMaker);
REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
REGISTER_OP_CPU_KERNEL(
affine_grid,
ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
affine_grid_grad,
ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Array1 = Eigen::DSizes<int64_t, 1>;
using Array2 = Eigen::DSizes<int64_t, 2>;
using Array3 = Eigen::DSizes<int64_t, 3>;
using Array4 = Eigen::DSizes<int64_t, 4>;
/**
 * Return a tensor with evenly spaced numbers over a specified interval.
 */
template <typename DeviceContext, typename T>
struct Linspace {
void operator()(T start, T end, int count, framework::Tensor* numbers,
const framework::ExecutionContext& ctx);
};
template <typename DeviceContext, typename T>
inline void GetIdxMap(int n, int h, int w, Tensor* grid,
const framework::ExecutionContext& ctx) {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
grid->mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
auto grid_t = EigenTensor<T, 4>::From(*grid);
// Get evenly spaced values in [-1, 1] along the height axis, shape [h]
Tensor h_idx;
Linspace<DeviceContext, T> linspace;
linspace((T)-1, (T)1, h, &h_idx, ctx);
auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
// Get evenly spaced values in [-1, 1] along the width axis, shape [w]
Tensor w_idx;
linspace((T)-1, (T)1, w, &w_idx, ctx);
auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
// Get constant ones tensor with shape [height, width, 1]
Tensor ones;
ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones
Tensor w_idx_map;
w_idx_map.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto w_idx_map_t = EigenTensor<T, 3>::From(w_idx_map);
Tensor h_idx_map;
h_idx_map.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto h_idx_map_t = EigenTensor<T, 3>::From(h_idx_map);
Tensor w_h_idx_map;
w_h_idx_map.mutable_data<T>({h, w, 2}, ctx.GetPlace());
auto w_h_idx_map_t = EigenTensor<T, 3>::From(w_h_idx_map);
Tensor w_h_one_idx_map;
w_h_one_idx_map.mutable_data<T>({h, w, 3}, ctx.GetPlace());
auto w_h_one_idx_map_t = EigenTensor<T, 3>::From(w_h_one_idx_map);
w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w))
.broadcast(Array2(h, 1))
.reshape(Array3(h, w, 1));
h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h))
.broadcast(Array2(w, 1))
.shuffle(Array2(1, 0))
.reshape(Array3(h, w, 1));
w_h_idx_map_t.device(place) = w_idx_map_t.concatenate(h_idx_map_t, 2);
w_h_one_idx_map_t.device(place) = w_h_idx_map_t.concatenate(ones_t, 2);
grid_t.device(place) = w_h_one_idx_map_t.reshape(Array4(1, h, w, 3))
.broadcast(Array4(n, 1, 1, 1));
}
template <typename DeviceContext, typename T>
class AffineGridOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* theta = ctx.Input<Tensor>("Theta");
int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
auto* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), output,
static_cast<T>(0));
Tensor grid;
GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx);
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2});
blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
T(0));
}
}
};
template <typename DeviceContext, typename T>
class AffineGridGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), theta_grad,
static_cast<T>(0));
Tensor grid;
GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx);
// Backward of output = grid * theta.T: theta_grad[i] = output_grad[i].T * grid[i]
// TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2});
Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
&sliced_theta_grad, T(0));
}
}
};
} // namespace operators
} // namespace paddle
@@ -219,8 +219,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

     d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<T>(ctx.GetPlace());
-    d_bias->mutable_data<T>(ctx.GetPlace());
+    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if ((N * H * W * D) == 1) {
@@ -272,8 +272,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-    const void *saved_mean_data = saved_mean->template data<T>();
-    const void *saved_var_data = saved_var->template data<T>();
+    const void *saved_mean_data =
+        saved_mean->template data<BatchNormParamType<T>>();
+    const void *saved_var_data =
+        saved_var->template data<BatchNormParamType<T>>();

     CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
         dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
@@ -281,10 +283,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
         CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
         data_desc_, d_y->template data<T>(), data_desc_,
         d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-        scale->template data<T>(),
-        d_scale->template mutable_data<T>(ctx.GetPlace()),
-        d_bias->template mutable_data<T>(ctx.GetPlace()), epsilon,
-        saved_mean_data, saved_var_data));
+        scale->template data<BatchNormParamType<T>>(),
+        d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+        d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+        epsilon, saved_mean_data, saved_var_data));

     // clean when exit.
     CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
@@ -304,4 +306,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
......
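The BatchNormParamType<T> change above is a mixed-precision fix: when T is float16, cuDNN batch normalization still keeps scale, bias and the saved statistics in float. A minimal sketch of the idea (the real definition lives in Paddle's cuDNN helpers; the names below are assumptions for illustration):

template <typename T>
struct ParamTypeTraitSketch {
  using type = T;  // float/double keep their own precision
};
template <>
struct ParamTypeTraitSketch<paddle::platform::float16> {
  using type = float;  // fp16 batch norm keeps scale/bias/stats in fp32
};
template <typename T>
using BatchNormParamTypeSketch = typename ParamTypeTraitSketch<T>::type;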
@@ -38,9 +38,10 @@ class CheckpointNotifyOp : public framework::OperatorBase {
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
     std::string dir = Attr<std::string>("dir");
     std::string lookup_table_name = Attr<std::string>("lookup_table");
+    int trainer_id = Attr<int>("trainer_id");

     distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
     for (size_t i = 0; i < epmap.size(); i++) {
       auto lookup_table_save_dir =
           string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
@@ -63,6 +64,7 @@ class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker {
         "dir", "(string, default '') indicate the folder checkpoint will use");
     AddAttr<std::string>("lookup_table",
                          "(string, default '') the lookup table name");
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
     AddComment(R"DOC(
 CheckpointNotify operator
......
@@ -143,9 +143,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
             cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
         // Currently tensor core is only enabled using this algo
         algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+        VLOG(5) << "use cudnn_tensor_op_math";
       } else {
         CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
             cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+        VLOG(5) << "NOT use cudnn_tensor_op_math";
       }
 #endif
@@ -160,6 +162,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     for (int i = 0; i < groups; i++) {
       auto cudnn_func = [&](void* cudnn_workspace) {
         CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
@@ -168,7 +171,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
             cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
             &beta, cudnn_output_desc, output_data + i * group_offset_out));
       };
-      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
   }
 };
@@ -314,6 +317,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
@@ -327,7 +331,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
             data_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
             cudnn_input_desc, input_grad_data + i * group_offset_in));
       };
-      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
   }
   // ------------------- cudnn conv backward filter ---------------------
@@ -343,7 +347,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
             filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
             cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
       };
-      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
   }
 }
@@ -359,7 +363,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
......
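Both conv kernels above hoist dev_ctx.cudnn_workspace_handle() out of the per-group loop and call RunFunc on the handle, so the scratch buffer is owned by one object instead of being re-negotiated on every call. A rough sketch of that pattern (hypothetical class, not Paddle's actual implementation):

#include <functional>
#include <vector>

class WorkspaceHandleSketch {
 public:
  // Run fn with a scratch buffer of at least required_bytes, reusing the
  // allocation across successive RunFunc calls on the same handle.
  void RunFunc(const std::function<void(void*)>& fn, size_t required_bytes) {
    if (buf_.size() < required_bytes) buf_.resize(required_bytes);
    fn(buf_.data());
  }

 private:
  std::vector<unsigned char> buf_;
};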
@@ -15,6 +15,8 @@
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"

+#include "paddle/fluid/framework/data_layout_transform.h"
+
 namespace paddle {
 namespace operators {
@@ -57,6 +59,11 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
     return conv_pd_->dst_primitive_desc().get_size();
   }

+  mkldnn::memory::format GetDstFormat() const {
+    return static_cast<mkldnn::memory::format>(
+        conv_pd_->dst_primitive_desc().desc().data.format);
+  }
+
   size_t GetDiffWeightsMemorySize() const {
     return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
   }
@@ -108,6 +115,20 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
                                "@data-weights_mem_p", pipeline);
   }

+  std::shared_ptr<mkldnn::memory> AcquireResidualDataMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromResidualDataMemory(
+      const std::shared_ptr<mkldnn::memory>& user_residual_memory_p,
+      void* dst_ptr,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    return this->AcquireMemory(user_residual_memory_p,
+                               this->AcquireDstMemoryFromPrimitive(dst_ptr),
+                               "@residual_data_mem_p", pipeline);
+  }
+
   std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
       void* ptr) {
     return this->AcquireMemoryFromPrimitive(
@@ -386,7 +407,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto user_weights_memory_p = handler.AcquireWeightsMemory(
         user_weights_md, to_void_cast<T>(filter_data));

-    T* output_data = nullptr;
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory_p =
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, pipeline, is_test);
+
+    std::shared_ptr<mkldnn::memory> dst_memory_p;

     if (fuse_residual_conn) {
       auto residual_param = ctx.Input<Tensor>("ResidualData");
@@ -399,21 +426,34 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                      "Output and elementwise parameter need to have the "
                      "same dimension sizes");

-      output->ShareDataWith(*residual_param);
-      output_data = output->mutable_data<T>(ctx.GetPlace());
+      if (residual_param->format() != handler.GetDstFormat()) {
+        auto output_data =
+            output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
+        auto residual_data_tz =
+            paddle::framework::vectorize2int(residual_param->dims());
+        auto residual_data_type =
+            paddle::framework::ToMKLDNNDataType(residual_param->type());
+        auto user_residual_md = platform::MKLDNNMemDesc(
+            residual_data_tz, residual_data_type, residual_param->format());
+        auto user_residual_memory_p = handler.AcquireResidualDataMemory(
+            user_residual_md, to_void_cast<T>(residual_param_data));
+        dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory(
+            user_residual_memory_p, to_void_cast<T>(output_data), pipeline);
+      } else {
+        output->ShareDataWith(*residual_param);
+        auto output_data = output->mutable_data<T>(ctx.GetPlace());
+        dst_memory_p =
+            handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
+      }
     } else {
-      output_data =
+      auto output_data =
           output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
+      dst_memory_p =
+          handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
     }

-    // create reorder primitive if the input format is not the preferred one
-    auto src_memory_p =
-        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
-    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline, is_test);
-    auto dst_memory_p =
-        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));

     // create convolution op primitive
     std::shared_ptr<mkldnn::convolution_forward> conv_p;
     if (bias) {
......
@@ -104,6 +104,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     int output_offset = output->numel() / output->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     for (int g = 0; g < groups; g++) {
       auto cudnn_func = [&](void* cudnn_workspace) {
         CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
@@ -112,7 +113,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
             algo, cudnn_workspace, workspace_size_in_bytes, &beta,
             cudnn_output_desc, output_data + output_offset * g));
       };
-      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
   }
 };
@@ -208,6 +209,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
         output_grad->numel() / output_grad->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
@@ -220,7 +222,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
             cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
             input_grad_data + input_offset * g));
       };
-      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
   }
@@ -238,7 +240,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
             cudnn_workspace, workspace_size_in_bytes, &beta,
             cudnn_filter_desc, filter_grad_data + filter_offset * g));
       };
-      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
   }
 }
......
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/platform/float16.h"

+namespace plat = paddle::platform;
 namespace ops = paddle::operators;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                         ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>);
-REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>,
+                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
......
@@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase {
   }
 };

+class DeleteVarOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};
+
 class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -48,4 +53,5 @@ It should not be configured by users directly.
 REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
                   paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::DeleteVarOpInfoMaker);
+                  paddle::operators::DeleteVarOpInfoMaker,
+                  paddle::operators::DeleteVarOpShapeInference);
......
@@ -79,7 +79,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
     auto* var = p_scope->FindVar(var_name_val);

     ::grpc::ByteBuffer req;
-    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
+    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);

     VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
@@ -105,7 +105,10 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
 void ProcGetResponse(const VarHandle& var_h,
                      const ::grpc::ByteBuffer& ret_msg) {
   framework::Variable* outvar = nullptr;
-  DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar);
+  // get response's trainer_id is not used
+  int trainer_id;
+  DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar,
+                            &trainer_id);
 }

 template <typename T>
@@ -135,6 +138,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
+    req.set_trainer_id(trainer_id_);
     ::grpc::ByteBuffer buf;
     RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
......
@@ -34,8 +34,8 @@ namespace distributed {

 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_name) {
+                           ::grpc::ByteBuffer* msg, const std::string& out_name,
+                           const int trainer_id) {
   platform::RecordRPCEvent record_event("serial", &ctx);
   // Default DestroyCallback does nothing, When using GPU
   // the CPU buffer need to be freed.
@@ -45,6 +45,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   size_t payload_size;

   request.set_varname(name);
+  request.set_trainer_id(trainer_id);
   // Note: normally the profiler is enabled in 1 trainer, hence only
   // 1 trainer returns true for ShouldSendProfileState(). It tells PS
   // servers the trainer's profiling state so that PS can follow the
@@ -147,11 +148,12 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
                                const framework::Scope* scope,
-                               framework::Variable** var) {
+                               framework::Variable** var, int* trainer_id) {
   platform::RecordRPCEvent record_event("deserial", &ctx);
   operators::distributed::GRPCVariableResponse resp(scope, &ctx);
   PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
   *var = resp.GetVar();
+  *trainer_id = resp.GetTrainerId();
 }

 }  // namespace distributed
......
@@ -38,12 +38,13 @@ typedef void (*DestroyCallback)(void*);

 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg,
-                           const std::string& out_varname = std::string());
+                           const std::string& out_varname = std::string(),
+                           const int trainer_id = 0);

 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
                                const framework::Scope* scope,
-                               framework::Variable** var);
+                               framework::Variable** var, int* trainer_id);

 }  // namespace distributed
 }  // namespace operators
......
@@ -102,9 +102,10 @@ class RequestSend final : public RequestBase {

     auto scope = request_->GetMutableLocalScope();
     auto invar = request_->GetVar();
+    int trainer_id = request_->GetTrainerId();
     framework::Variable* outvar = nullptr;

-    request_handler_->Handle(varname, scope, invar, &outvar);
+    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id);
     Finish(reply_, &responder_);
   }
@@ -133,13 +134,14 @@ class RequestGet final : public RequestBase {
   void Process() override {
     // proc request.
     std::string varname = request_.varname();
+    int trainer_id = request_.trainer_id();
     VLOG(4) << "RequestGet " << varname;

     auto scope = request_handler_->scope();
     auto invar = scope->FindVar(varname);
     framework::Variable* outvar = nullptr;

-    request_handler_->Handle(varname, scope, invar, &outvar);
+    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id);

     if (outvar) {
       SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
@@ -179,6 +181,7 @@ class RequestPrefetch final : public RequestBase {
     // prefetch process...
     std::string in_var_name = request_->Varname();
     std::string out_var_name = request_->OutVarname();
+    int trainer_id = request_->GetTrainerId();
     VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
             << " out_var_name: " << out_var_name;
@@ -187,7 +190,8 @@ class RequestPrefetch final : public RequestBase {
     // out var must be created in local scope!
     framework::Variable* outvar = scope->Var(out_var_name);

-    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
+                             out_var_name);

     SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
                           &reply_);
@@ -225,12 +229,13 @@ class RequestCheckpointNotify final : public RequestBase {

     std::string checkpoint_notify = request_->Varname();
     std::string checkpoint_dir = request_->OutVarname();
+    int trainer_id = request_->GetTrainerId();

     VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
             << ", dir: " << checkpoint_dir;

     request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
-                             checkpoint_dir);
+                             trainer_id, checkpoint_dir);
     Finish(reply_, &responder_);
   }
......
@@ -286,13 +286,21 @@ int GRPCVariableResponse::Parse(Source* source) {
         platform::EnableProfiler(platform::ProfilerState::kCPU);
       } else if (profiling == platform::kDisableProfiler &&
                  platform::IsProfileEnabled()) {
-        // TODO(panyx0718): Should we allow to customize file dir.
         platform::DisableProfiler(
             platform::EventSortingKey::kDefault,
-            string::Sprintf("/tmp/profile_ps_%lld", listener_id));
+            string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path,
+                            listener_id));
       }
       break;
     }
+    case sendrecv::VariableMessage::kTrainerIdFieldNumber: {
+      uint64_t trainer_id = 0;
+      if (!input.ReadVarint64(&trainer_id)) {
+        return tag;
+      }
+      meta_.set_trainer_id(trainer_id);
+      break;
+    }
     default: {
       // Unknown tag, return unknown error.
       return -1;
......
@@ -190,6 +190,7 @@ class RequestHandler {
   // }
   virtual bool Handle(const std::string& varname, framework::Scope* scope,
                       framework::Variable* var, framework::Variable** outvar,
+                      const int trainer_id,
                       const std::string& out_var_name = "") = 0;

  protected:
......
@@ -36,6 +36,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                 framework::Scope* scope,
                                 framework::Variable* invar,
                                 framework::Variable** outvar,
+                                const int trainer_id,
                                 const std::string& out_var_name) {
   VLOG(4) << "RequestSendHandler:" << varname;
@@ -50,7 +51,6 @@ bool RequestSendHandler::Handle(const std::string& varname,
   // Async
   if (!sync_mode_) {
     VLOG(3) << "async process var: " << varname;
-    rpc_server_->Profiler().OneStep();
     try {
       executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
                                     scope);
@@ -76,6 +76,7 @@ bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Scope* scope,
                                framework::Variable* invar,
                                framework::Variable** outvar,
+                               const int trainer_id,
                                const std::string& out_var_name) {
   VLOG(4) << "RequestGetHandler:" << varname;
   if (sync_mode_) {
@@ -88,6 +89,19 @@ bool RequestGetHandler::Handle(const std::string& varname,
     }
   } else {
     if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) {
+      if (enable_dc_asgd_) {
+        // NOTE: the format is determined by distributed_transpiler.py
+        std::string param_bak_name =
+            string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
+        VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
+        auto var = scope_->FindVar(varname);
+        auto t_orig = var->Get<framework::LoDTensor>();
+        auto param_bak = scope_->Var(param_bak_name);
+        auto t = param_bak->GetMutable<framework::LoDTensor>();
+        t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
+        VLOG(3) << "copying " << varname << " to " << param_bak_name;
+        framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
+      }
       *outvar = scope_->FindVar(varname);
     }
   }
@@ -98,6 +112,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
                                     framework::Scope* scope,
                                     framework::Variable* invar,
                                     framework::Variable** outvar,
+                                    const int trainer_id,
                                     const std::string& out_var_name) {
   VLOG(4) << "RequestPrefetchHandler " << varname;
@@ -113,6 +128,7 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
                                       framework::Scope* scope,
                                       framework::Variable* invar,
                                       framework::Variable** outvar,
+                                      const int trainer_id,
                                       const std::string& out_var_name) {
   PADDLE_ENFORCE(
       checkpoint_notify_id != -1,
......
@@ -36,20 +36,34 @@ namespace distributed {
 class RequestSendHandler final : public RequestHandler {
  public:
-  explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  explicit RequestSendHandler(bool sync_mode, bool enable_dc_asgd = false)
+      : RequestHandler(sync_mode) {
+    enable_dc_asgd_ = enable_dc_asgd;
+  }
   virtual ~RequestSendHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id,
               const std::string& out_var_name = "") override;
+
+ private:
+  bool enable_dc_asgd_;
 };

 class RequestGetHandler final : public RequestHandler {
  public:
-  explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  explicit RequestGetHandler(bool sync_mode, bool enable_dc_asgd = false)
+      : RequestHandler(sync_mode) {
+    enable_dc_asgd_ = enable_dc_asgd;
+  }
   virtual ~RequestGetHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id,
               const std::string& out_var_name = "") override;
+
+ private:
+  bool enable_dc_asgd_;
 };

 class RequestPrefetchHandler final : public RequestHandler {
@@ -58,6 +72,7 @@ class RequestPrefetchHandler final : public RequestHandler {
   virtual ~RequestPrefetchHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id,
               const std::string& out_var_name = "") override;
 };
@@ -70,6 +85,7 @@ class RequestCheckpointHandler final : public RequestHandler {
   virtual ~RequestCheckpointHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id,
               const std::string& out_var_name = "") override;

  private:
......
@@ -24,6 +24,7 @@ namespace distributed {

 std::once_flag RPCClient::init_flag_;
 std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
+int RPCClient::trainer_id_ = 0;

 }  // namespace distributed
 }  // namespace operators
......
@@ -72,14 +72,15 @@ class RPCClient {
   virtual bool Wait() = 0;

   template <typename T>
-  static RPCClient* GetInstance() {
-    std::call_once(init_flag_, &RPCClient::Init<T>);
+  static RPCClient* GetInstance(int trainer_id) {
+    std::call_once(init_flag_, &RPCClient::Init<T>, trainer_id);
     return rpc_client_.get();
   }

   // Init is called by GetInstance.
   template <typename T>
-  static void Init() {
+  static void Init(int trainer_id) {
+    trainer_id_ = trainer_id;
     if (rpc_client_.get() == nullptr) {
       rpc_client_.reset(new T());
       rpc_client_->InitImpl();
@@ -88,6 +89,8 @@ class RPCClient {
  protected:
   virtual void InitImpl() {}
+  // each trainer have exact one trainer id, it should be static
+  static int trainer_id_;

  private:
   static std::once_flag init_flag_;
......
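Note how GetInstance(trainer_id) threads the id through std::call_once: Init runs exactly once per process, so only the first caller's trainer_id is recorded and the argument of every later call is ignored. A stripped-down sketch of the pattern (hypothetical names, not Paddle's actual class):

#include <memory>
#include <mutex>

class ClientSketch {
 public:
  static ClientSketch* GetInstance(int trainer_id) {
    // Init runs exactly once; trainer_id from later calls is ignored.
    std::call_once(init_flag_, &ClientSketch::Init, trainer_id);
    return instance_.get();
  }

 private:
  static void Init(int trainer_id) {
    trainer_id_ = trainer_id;
    instance_.reset(new ClientSketch());
  }
  static std::once_flag init_flag_;
  static std::unique_ptr<ClientSketch> instance_;
  static int trainer_id_;
};

std::once_flag ClientSketch::init_flag_;
std::unique_ptr<ClientSketch> ClientSketch::instance_;
int ClientSketch::trainer_id_ = 0;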
@@ -20,42 +20,10 @@
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/platform/profiler.h"

-DEFINE_int32(rpc_server_profile_period, 0,
-             "the period of listen_and_serv to do profile");
-DEFINE_string(rpc_server_profile_path, "/dev/null",
-              "the profile log file path");
-
 namespace paddle {
 namespace operators {
 namespace distributed {

-RPCServerProfiler::RPCServerProfiler(int profile_period,
-                                     const std::string& profile_log_path)
-    : profile_period_(profile_period), profile_log_path_(profile_log_path) {
-  step_ = 0;
-}
-
-void RPCServerProfiler::OneStep() {
-  PADDLE_ENFORCE_LE(step_, profile_period_,
-                    "step_ should not be larger then "
-                    "profile_period_");
-  if (profile_period_ <= 0) {
-    return;
-  }
-
-  if (step_ == 0) {
-    auto pf_state = paddle::platform::ProfilerState::kCPU;
-    paddle::platform::EnableProfiler(pf_state);
-  }
-  if (step_ == profile_period_) {
-    paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
-                                      profile_log_path_);
-    step_ = 0;
-  } else {
-    step_++;
-  }
-}
-
 void RPCServer::ShutDown() {
   LOG(INFO) << "RPCServer ShutDown ";
   ShutDownImpl();
......
@@ -23,30 +23,14 @@
 #include "paddle/fluid/operators/distributed/request_handler.h"

-DECLARE_int32(rpc_server_profile_period);
-DECLARE_string(rpc_server_profile_path);
-
 namespace paddle {
 namespace operators {
 namespace distributed {

-class RPCServerProfiler {
- public:
-  RPCServerProfiler(int profile_period, const std::string& profile_log_path);
-  void OneStep();
-
- private:
-  const int profile_period_;
-  std::string profile_log_path_;
-  int step_;
-};
-
 class RPCServer {
  public:
   explicit RPCServer(const std::string& address, int client_num)
       : cur_cond_(0),
-        profiler_(FLAGS_rpc_server_profile_period,
-                  FLAGS_rpc_server_profile_path),
         bind_address_(address),
         exit_flag_(false),
         selected_port_(0),
@@ -86,7 +70,6 @@ class RPCServer {
   void Complete();
   void ResetBarrierCounter();
-  RPCServerProfiler& Profiler() { return profiler_; }

   bool NeedResetAllVars();
@@ -101,7 +84,6 @@ class RPCServer {
   std::unordered_map<std::string, int> rpc_cond_map_;
   std::atomic<int> cur_cond_;
   std::condition_variable rpc_cond_;
-  RPCServerProfiler profiler_;

  protected:
   std::string bind_address_;
......
@@ -125,7 +125,7 @@ TEST(PREFETCH, CPU) {
   g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
   g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
   distributed::RPCClient* client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);

   std::thread server_thread(StartServer, distributed::kRequestPrefetch);
   g_rpc_service->WaitServerReady();
@@ -165,7 +165,7 @@ TEST(COMPLETE, CPU) {
   g_req_handler.reset(new distributed::RequestSendHandler(true));
   g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
   distributed::RPCClient* client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
   PADDLE_ENFORCE(client != nullptr);
   std::thread server_thread(StartServer, distributed::kRequestSend);
   g_rpc_service->WaitServerReady();
......
@@ -79,6 +79,7 @@ message VariableMessage {
   // server stops profiling and generates a profile to /tmp/profile_ps_*
   // when profile switches from 1 to 2.
   int64 profile = 11;
+  int64 trainer_id = 12;
 }
 message VoidMessage {}
@@ -16,6 +16,9 @@
 #include <vector>
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+DEFINE_string(rpc_server_profile_path, "./profile_ps",
+              "the profile log file path");
 namespace paddle {
 namespace operators {
 namespace distributed {
......
@@ -27,6 +27,8 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
+DECLARE_string(rpc_server_profile_path);
 namespace paddle {
 namespace operators {
 namespace distributed {
@@ -92,6 +94,8 @@ class VariableResponse {
    return scope_->FindVar(meta_.varname());
  }
+  int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
 protected:
  bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
               const platform::DeviceContext& dev_ctx, platform::Place place,
......
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
@@ -391,7 +391,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
   int j = blockIdx.x;
   int i = threadIdx.x;
   int tid = threadIdx.x;
-  T val = 0;
+  T val(0);
   do {
     int x_offset = i * w + j;
@@ -459,7 +459,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   int tid = threadIdx.x;
   int j = blockIdx.x;
-  T val = 0;
+  T val(0);
   int ttid = tid;
   while (true) {
......
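A note on the T val = 0; to T val(0); change above: it is presumably what lets these kernels instantiate with platform::float16, whose converting constructors are explicit. Copy-initialization cannot call an explicit constructor, while direct-initialization can. A minimal standalone illustration with a hypothetical Half type (not Paddle's actual float16):

#include <cstdint>

// hypothetical 16-bit float stand-in with an explicit converting ctor,
// like paddle::platform::float16
struct Half {
  uint16_t bits;
  explicit Half(int v) : bits(static_cast<uint16_t>(v)) {}
};

int main() {
  // Half a = 0;  // error: copy-initialization cannot use an explicit ctor
  Half b(0);      // ok: direct-initialization, as in the kernels above
  (void)b;
  return 0;
}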
@@ -37,7 +37,8 @@ class FetchBarrierOp : public framework::OperatorBase {
                const platform::Place& place) const override {
     std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
     distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+            Attr<int>("trainer_id"));
     PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
@@ -61,6 +62,7 @@ This operator will send a send barrier signal to list_and_serv op, so that
 the Parameter Server would know that all variables have been sent.
 )DOC");
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
     AddAttr<std::vector<std::string>>("endpoints",
                                       "(string vector, default 127.0.0.1:6164)"
                                       "Server endpoints to send variables to.")
......
@@ -61,7 +61,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
     std::vector<std::string> endpoint_list =
         Attr<std::vector<std::string>>("endpoint_list");
     distributed::RPCClient* client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
     for (auto& ep : endpoint_list) {
       VLOG(3) << "sending nccl id to " << ep;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
using ScopedSpatialTransformerDescriptor =
platform::ScopedSpatialTransformerDescriptor;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output = ctx.Output<Tensor>("Output");
int n = input->dims()[0];
int c = input->dims()[1];
int h = input->dims()[2];
int w = input->dims()[3];
const int size[4] = {n, c, h, w};
const T* input_data = input->data<T>();
const T* grid_data = grid->data<T>();
T* output_data = output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
ScopedSpatialTransformerDescriptor st_desc;
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, size);
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(output->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward(
handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
output_data));
}
};
template <typename T>
class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
auto output_grad_dims = output_grad->dims();
const int n = output_grad_dims[0];
const int c = output_grad_dims[1];
const int h = output_grad_dims[2];
const int w = output_grad_dims[3];
const int size[4] = {n, c, h, w};
ScopedSpatialTransformerDescriptor st_dest;
cudnnSpatialTransformerDescriptor_t cudnn_st_dest =
st_dest.descriptor<T>(4, size);
const T* input_data = input->data<T>();
const T* grid_data = grid->data<T>();
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data =
input_grad->mutable_data<T>(output_grad_dims, ctx.GetPlace());
T* grid_grad_data =
grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor input_grad_desc;
ScopedTensorDescriptor output_grad_desc;
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t cudnn_input_grad_desc =
input_grad_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(input_grad->dims()));
cudnnTensorDescriptor_t cudnn_output_grad_desc =
output_grad_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(output_grad->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward(
handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
output_grad_data, grid_data, CudnnDataType<T>::kZero(),
grid_grad_data));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_KERNEL(grid_sampler, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNGridSampleOpKernel<float>,
paddle::operators::CUDNNGridSampleOpKernel<double>);
REGISTER_OP_KERNEL(grid_sampler_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNGridSampleGradOpKernel<float>,
paddle::operators::CUDNNGridSampleGradOpKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/grid_sampler_op.h"
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class GridSampleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of GridSampleOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grid"),
"Input(Grid) of GridSampleOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of GridSampleOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid");
PADDLE_ENFORCE(x_dims.size() == 4,
"Input(X) of GridSampleOp should be 4-D Tensor.");
PADDLE_ENFORCE(grid_dims.size() == 4,
"Input(Grid) of GridSampleOp should be 4-D Tensor.");
PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
"Input(X) and Input(Grid) dims[0] should be equal.");
PADDLE_ENFORCE_EQ(
grid_dims[1], x_dims[2],
"Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
PADDLE_ENFORCE_EQ(
grid_dims[2], x_dims[3],
"Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
ctx->SetOutputDim("Output", x_dims);
ctx->ShareLoD("X", "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
framework::DataLayout::kAnyLayout, library_);
}
};
class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input data of GridSampleOp, "
"This is a 4-D tensor with shape of [N, C, H, W]");
AddInput(
"Grid",
"(Tensor) The input grid of GridSampleOp generated by AffineGridOp, "
"This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation "
"of x and y coordinates with shape [N, H, W] in last dimention");
AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]");
AddAttr<bool>(
"use_cudnn",
"(bool, default true) Only used in cudnn kernel, need install cudnn")
.SetDefault(true);
AddComment(R"DOC(
This operation samples input X by using bilinear interpolation based on
a flow field grid, which is usually generated by affine_grid. The grid of
shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
with shape [N, H, W] each, where grid_x indexes the 4th dimension
(the width dimension) of input data X and grid_y indexes the 3rd
dimension (the height dimension); the final result is the bilinear
interpolation value of the 4 nearest corner points.
Step 1:
Get (x, y) grid coordinates and scale them to [0, W-1] and [0, H-1].
grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
Step 2:
Index input data X with grid (x, y) in each [H, W] area, and bilinearly
interpolate the point value from its 4 nearest corner points.
wn ------- y_n ------- en
| | |
| d_n |
| | |
x_w --d_w-- grid--d_e-- x_e
| | |
| d_s |
| | |
ws ------- y_s ------- es
x_w = floor(x) // west side x coord
x_e = x_w + 1 // east side x coord
y_n = floor(y) // north side y coord
y_s = y_n + 1 // south side y coord
d_w = grid_x - x_w // distance to west side
d_e = x_e - grid_x // distance to east side
d_n = grid_y - y_n // distance to north side
d_s = y_s - grid_y // distance to south side
wn = X[:, :, y_n, x_w] // north-west point value
en = X[:, :, y_n, x_e] // north-east point value
ws = X[:, :, y_s, x_w] // south-west point value
es = X[:, :, y_s, x_e] // south-east point value
output = wn * d_e * d_s + en * d_w * d_s
+ ws * d_e * d_n + es * d_w * d_n
)DOC");
}
};
class GridSampleOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto input_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
}
if (ctx->HasOutput(framework::GradVarName("Grid"))) {
ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
framework::DataLayout::kAnyLayout, library_);
}
};
class GridSampleGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("grid_sampler_grad");
op->SetInput("X", Input("X"));
op->SetInput("Grid", Input("Grid"));
op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Grid"), InputGrad("Grid"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
ops::GridSampleGradMaker);
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
REGISTER_OP_CPU_KERNEL(
grid_sampler,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
grid_sampler_grad,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
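The bilinear formula spelled out in the DOC block above fits in a few lines of scalar code. A minimal standalone sketch (hypothetical helper, no Paddle dependencies; it omits the out-of-bounds handling that the real kernels add via isInBound):

#include <cmath>
#include <cstdio>

// Bilinearly sample one point from an H x W single-channel image.
// x and y are already scaled to [0, W-1] and [0, H-1].
float BilinearSample(const float* X, int H, int W, float x, float y) {
  int x_w = static_cast<int>(std::floor(x));  // west side x coord
  int x_e = x_w + 1;                          // east side x coord
  int y_n = static_cast<int>(std::floor(y));  // north side y coord
  int y_s = y_n + 1;                          // south side y coord
  float d_w = x - x_w, d_e = x_e - x;         // distances to west/east sides
  float d_n = y - y_n, d_s = y_s - y;         // distances to north/south sides
  float wn = X[y_n * W + x_w], en = X[y_n * W + x_e];  // north corner values
  float ws = X[y_s * W + x_w], es = X[y_s * W + x_e];  // south corner values
  return wn * d_e * d_s + en * d_w * d_s + ws * d_e * d_n + es * d_w * d_n;
}

int main() {
  float X[4] = {1.f, 2.f, 3.f, 4.f};  // 2x2 image
  // sampling the exact center weights all four corners by 0.25: prints 2.5
  std::printf("%f\n", BilinearSample(X, 2, 2, 0.5f, 0.5f));
  return 0;
}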
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Array3 = Eigen::DSizes<int64_t, 3>;
using Array4 = Eigen::DSizes<int64_t, 4>;
template <typename T>
static inline bool isInBound(T x, T y, T x_max, T y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
return false;
}
return true;
}
template <typename T>
static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
const Tensor& grid, Tensor* x_w, Tensor* x_e,
Tensor* y_n, Tensor* y_s, Tensor* d_w,
Tensor* d_e, Tensor* d_n, Tensor* d_s) {
auto& place = *ctx.eigen_device();
const int n = grid.dims()[0];
const int h = grid.dims()[1];
const int w = grid.dims()[2];
const T x_max = static_cast<T>(w - 1);
const T y_max = static_cast<T>(h - 1);
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
Tensor grid_x, grid_y;
T* grid_x_data = grid_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
T* grid_y_data = grid_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * h * w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
Tensor ones;
ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
Tensor half_xmax;
Tensor half_ymax;
half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto half_xmax_t = EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
half_ymax.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto half_ymax_t = EigenTensor<T, 3>::From(half_ymax).setConstant(0.5 * y_max);
// scale grid from [-1, 1] to [0, h-1] and [0, w-1]
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t;
grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t;
// calculate coords of 4 corner points
x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
x_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
y_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
y_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
auto x_w_t = EigenTensor<T, 3>::From(*x_w);
auto x_e_t = EigenTensor<T, 3>::From(*x_e);
auto y_n_t = EigenTensor<T, 3>::From(*y_n);
auto y_s_t = EigenTensor<T, 3>::From(*y_s);
x_w_t.device(place) = grid_x_t.floor();
x_e_t.device(place) = x_w_t + ones_t;
y_n_t.device(place) = grid_y_t.floor();
y_s_t.device(place) = y_n_t + ones_t;
// calculate distances to 4 sides
d_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
d_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
d_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
d_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
auto d_w_t = EigenTensor<T, 3>::From(*d_w);
auto d_e_t = EigenTensor<T, 3>::From(*d_e);
auto d_n_t = EigenTensor<T, 3>::From(*d_n);
auto d_s_t = EigenTensor<T, 3>::From(*d_s);
d_w_t.device(place) = grid_x_t - x_w_t;
d_e_t.device(place) = x_e_t - grid_x_t;
d_n_t.device(place) = grid_y_t - y_n_t;
d_s_t.device(place) = y_s_t - grid_y_t;
}
template <typename T>
static void GetGridPointValue(const Tensor& input, Tensor* output,
const Tensor& x, const Tensor& y) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int h = input.dims()[2];
const int w = input.dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
auto input_t = EigenTensor<T, 4>::From(input);
for (int i = 0; i < n; i++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
for (int j = 0; j < c; j++) {
output_t(i, j, k, l) =
input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l))));
}
}
}
}
}
}
template <typename T>
static void GatherOutputGradToInputGrad(const Tensor& output_grad,
Tensor* input_grad, const Tensor& x,
const Tensor& y, const Tensor& d1,
const Tensor& d2) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int h = output_grad.dims()[2];
const int w = output_grad.dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto d1_t = EigenTensor<T, 3>::From(d1);
auto d2_t = EigenTensor<T, 3>::From(d2);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
}
}
}
}
}
}
template <typename DeviceContext, typename T>
class GridSampleOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
// calc locations and distances of 4 corner points
Tensor x_w, x_e, y_n, y_s;
Tensor d_w, d_e, d_n, d_s;
CalcGridLocations<T>(
ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
&x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
auto* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), output,
static_cast<T>(0));
// calc 4 corner points value
Tensor v_wn, v_en, v_ws, v_es;
v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
GetGridPointValue<T>(*input, &v_en, x_e, y_n);
GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
GetGridPointValue<T>(*input, &v_es, x_e, y_s);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto d_w_scaled_t =
d_w_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
auto d_e_scaled_t =
d_e_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
auto d_n_scaled_t =
d_n_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
auto d_s_scaled_t =
d_s_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto output_t = EigenTensor<T, 4>::From(*output);
// bilinear interpolation by 4 corner points
output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
v_en_t * d_w_scaled_t * d_s_scaled_t +
v_ws_t * d_e_scaled_t * d_n_scaled_t +
v_es_t * d_w_scaled_t * d_n_scaled_t;
}
};
template <typename DeviceContext, typename T>
class GridSampleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
const int n = input->dims()[0];
const int c = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), input_grad,
static_cast<T>(0));
auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), grid_grad,
static_cast<T>(0));
Tensor x_w, x_e, y_n, y_s;
Tensor d_w, d_e, d_n, d_s;
CalcGridLocations<T>(
ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
&x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
// gather output grad value to input grad by corner point coords and weight
GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_n, d_e,
d_s);
GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_s, d_e,
d_n);
GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_n, d_w,
d_s);
GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_s, d_w,
d_n);
// calc 4 corner points value
Tensor v_wn, v_en, v_ws, v_es;
v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
GetGridPointValue<T>(*input, &v_en, x_e, y_n);
GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
GetGridPointValue<T>(*input, &v_es, x_e, y_s);
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto output_grad_t = EigenTensor<T, 4>::From(*output_grad);
Tensor grid_grad_x, grid_grad_y;
grid_grad_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
grid_grad_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto grid_grad_x_t = EigenTensor<T, 3>::From(grid_grad_x).setConstant(0.0);
auto grid_grad_y_t = EigenTensor<T, 3>::From(grid_grad_y).setConstant(0.0);
for (int i = 0; i < n; i++) {
for (int j = 0; j < c; j++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
grid_grad_x_t(i, k, l) +=
((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
(v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
output_grad_t(i, j, k, l);
grid_grad_y_t(i, k, l) +=
((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
(v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
output_grad_t(i, j, k, l);
}
}
}
}
const T x_max = static_cast<T>(w - 1);
const T y_max = static_cast<T>(h - 1);
grid_grad_x_t = grid_grad_x_t * (x_max / (T)2);
grid_grad_y_t = grid_grad_y_t * (y_max / (T)2);
// gather grid_grad [x, y] in 3rd Dim
T* grid_grad_data = grid_grad->data<T>();
T* grid_grad_x_data = grid_grad_x.data<T>();
T* grid_grad_y_data = grid_grad_y.data<T>();
for (int i = 0; i < n * h * w; i++) {
grid_grad_data[2 * i] = grid_grad_x_data[i];
grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
}
}
};
} // namespace operators
} // namespace paddle
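One detail worth checking in CalcGridLocations above is the rescaling of the normalized grid from [-1, 1] to pixel coordinates, grid_x = 0.5 * (grid_x + 1) * (W - 1), and likewise for y. A standalone sketch of that mapping with hypothetical sizes:

#include <cassert>

// matches grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t above
float ToPixelCoord(float g, int size) { return 0.5f * (g + 1.f) * (size - 1); }

int main() {
  assert(ToPixelCoord(-1.f, 8) == 0.f);  // -1 maps to the left/top edge
  assert(ToPixelCoord(1.f, 8) == 7.f);   // +1 maps to the right/bottom edge
  assert(ToPixelCoord(0.f, 8) == 3.5f);  // 0 lands between pixels 3 and 4
  return 0;
}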
@@ -134,7 +134,6 @@ void ListenAndServOp::RunSyncLoop(
   rpc_service_->ResetBarrierCounter();
   while (true) {
-    rpc_service_->Profiler().OneStep();
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrive, just add suffix 0~n and merge the gradient.
     rpc_service_->SetCond(distributed::kRequestSend);
@@ -218,23 +217,26 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                    framework::ProgramDesc *program,
                                    framework::Scope *recv_scope) const {
   VLOG(2) << "RunAsyncLoop";
-  // grad name to block id
-  std::unordered_map<std::string, int32_t> grad_to_block_id;
-  std::unordered_map<int32_t, std::string> id_to_grad;
   auto grad_to_block_id_str =
       Attr<std::vector<std::string>>("grad_to_block_id");
-  for (const auto &grad_and_id : grad_to_block_id_str) {
+  DoubleFindMap<std::string, int32_t> grad_to_block_id;
+  auto append_block_maps = [](DoubleFindMap<std::string, int32_t> *out_map,
+                              const std::string &grad_and_id) {
     std::vector<std::string> pieces;
     split(grad_and_id, ':', &pieces);
-    VLOG(3) << "after split, grad = " << pieces[0] << ", id=" << pieces[1];
+    VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
     PADDLE_ENFORCE_EQ(pieces.size(), 2);
-    PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0);
+    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);
     int block_id = std::stoi(pieces[1]);
-    grad_to_block_id[pieces[0]] = block_id;
-    id_to_grad[block_id] = pieces[0];
+    (*out_map)[pieces[0]] = block_id;
+  };
+  for (const auto &grad_and_id : grad_to_block_id_str) {
+    append_block_maps(&grad_to_block_id, grad_and_id);
   }
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
@@ -244,15 +246,22 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
     block_list.push_back(blkid);
   }
   auto optimize_prepared = executor->Prepare(*program, block_list);
-  // execute global block if needed
-  if (block_list[0] == 1 && id_to_grad.count(1) == 0) {
+  // execute the global block if needed; block id 1 in the program is the
+  // global block when it is not bound to a grad var for its update.
+  if (block_list[0] == 1 &&
+      grad_to_block_id.find_value(static_cast<int32_t>(1)) ==
+          grad_to_block_id.end()) {
     executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
   }
   std::unordered_map<std::string,
                      std::shared_ptr<framework::ExecutorPrepareContext>>
-      grad_to_prepared_ctx;
+      grad_to_prepared_ctx, param_to_prepared_ctx;
   for (size_t i = 0; i < block_list.size(); ++i) {
-    grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i];
+    auto blkid = block_list[i];
+    auto it = grad_to_block_id.find_value(blkid);
+    if (it != grad_to_block_id.end()) {
+      grad_to_prepared_ctx[it->first] = optimize_prepared[i];
+    }
   }
   request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
@@ -315,6 +324,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   framework::Scope &recv_scope = scope.NewScope();
   bool sync_mode = Attr<bool>("sync_mode");
+  bool dc_sgd = Attr<bool>("dc_asgd");
   auto fan_in = Attr<int>("Fanin");
   auto inputs = Inputs("X");
@@ -328,8 +338,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
-  request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode));
-  request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
+  request_send_handler_.reset(
+      new distributed::RequestSendHandler(sync_mode, dc_sgd));
+  request_get_handler_.reset(
+      new distributed::RequestGetHandler(sync_mode, dc_sgd));
   request_prefetch_handler_.reset(
       new distributed::RequestPrefetchHandler(sync_mode));
   request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler(
@@ -443,6 +455,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
                        "a map from grad name to its optimize block id")
       .SetDefault({});
   AddAttr<bool>("sync_mode", "whether it works in sync_mode or not").SetDefault(true);
+  AddAttr<bool>("dc_asgd", "set to true to enable DC-ASGD training.")
+      .SetDefault(false);
   AddAttr<std::vector<framework::BlockDesc *>>(
       kOptimizeBlocks, "Optimize blocks to run on server side.")
       .SetDefault({});
......
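Each grad_to_block_id entry consumed by append_block_maps above is a "name:block_id" string split on ':'. A standalone sketch of the same parsing with hypothetical entries (using std::string::find in place of Paddle's split helper):

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // hypothetical attribute value, e.g. as produced by the transpiler
  std::vector<std::string> grad_to_block_id_str = {"w@GRAD:1", "b@GRAD:2"};
  std::unordered_map<std::string, std::int32_t> grad_to_block_id;
  for (const auto& grad_and_id : grad_to_block_id_str) {
    auto pos = grad_and_id.find(':');
    grad_to_block_id[grad_and_id.substr(0, pos)] =
        std::stoi(grad_and_id.substr(pos + 1));
  }
  for (const auto& kv : grad_to_block_id)
    std::cout << kv.first << " -> block " << kv.second << "\n";
  return 0;
}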
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <atomic>
 #include <set>
 #include <string>
+#include <algorithm>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
@@ -37,6 +38,17 @@ constexpr char kCheckpointBlockId[] = "checkpint_block_id";
 void RunServer(std::shared_ptr<distributed::RPCServer> service);
+template <class TKey, class TValue>
+class DoubleFindMap : public std::unordered_map<TKey, TValue> {
+ public:
+  // linear scan: returns an iterator to the first entry whose value is v
+  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
+    return std::find_if(this->begin(), this->end(),
+                        [&v](const std::pair<const TKey, TValue>& p) {
+                          return p.second == v;
+                        });
+  }
+};
 class ListenAndServOp : public framework::OperatorBase {
  public:
  ListenAndServOp(const std::string& type,
......
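The DoubleFindMap added above gives an unordered_map a linear-time reverse lookup (value to iterator), which RunAsyncLoop uses to ask whether block id 1 is bound to any grad. A small self-contained usage sketch (the class body is copied from the header above):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

template <class TKey, class TValue>
class DoubleFindMap : public std::unordered_map<TKey, TValue> {
 public:
  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
    return std::find_if(this->begin(), this->end(),
                        [&v](const std::pair<const TKey, TValue>& p) {
                          return p.second == v;
                        });
  }
};

int main() {
  DoubleFindMap<std::string, std::int32_t> m;
  m["w@GRAD"] = 1;
  m["b@GRAD"] = 2;
  auto it = m.find_value(1);  // O(n) reverse lookup by value
  std::cout << (it != m.end() ? it->first : "absent") << "\n";  // w@GRAD
  std::cout << std::boolalpha << (m.find_value(3) == m.end()) << "\n";  // true
  return 0;
}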
@@ -76,8 +76,12 @@ endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 if (NOT WIN32)
-  cc_library(jit_kernel
-    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
-    DEPS cpu_info cblas)
-  cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
+  set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
+  set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
+  if(WITH_XBYAK)
+    list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
+    list(APPEND JIT_KERNEL_DEPS xbyak)
+  endif()
+  cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
+  cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
 endif()
@@ -51,7 +51,7 @@ struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
                   T* dy) const {
     const int block_size = 512;
     dim3 threads(block_size, 1);
-    dim3 grid(1, (rows + block_size - 1) / block_size);
+    dim3 grid((rows + block_size - 1) / block_size, 1);
     CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
         x_norm, y_norm, x, y, z, dz, rows, cols, dy);
   }
......
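The cos_sim fix above moves the block count from gridDim.y to gridDim.x; the ceil-division itself is unchanged, but gridDim.x has a far larger hardware limit than gridDim.y's 65535 on current architectures, and the kernel presumably indexes rows through blockIdx.x. A sketch of the launch arithmetic with hypothetical sizes:

#include <cstdio>

int main() {
  const int rows = 100000, block_size = 512;                // hypothetical
  const int blocks = (rows + block_size - 1) / block_size;  // ceil-div = 196
  // old launch: dim3 grid(1, blocks);  new launch: dim3 grid(blocks, 1);
  std::printf("%d blocks of %d threads\n", blocks, block_size);
  return 0;
}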
@@ -21,6 +21,16 @@ namespace operators {
 namespace math {
 namespace {
+__device__ __forceinline__ float real_log(float x) { return logf(x); }
+__device__ __forceinline__ double real_log(double x) { return log(x); }
+__device__ __forceinline__ platform::float16 real_log(
+    const platform::float16& val) {
+  return static_cast<platform::float16>(logf(static_cast<float>(val)));
+}
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                    const int N, const int D,
@@ -29,8 +39,8 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
        i += blockDim.x * gridDim.x) {
     PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index);
     Y[i] = ignore_index == label[i]
-               ? 0
-               : -math::TolerableValue<T>()(log(X[i * D + label[i]]));
+               ? static_cast<T>(0)
+               : -math::TolerableValue<T>()(real_log(X[i * D + label[i]]));
   }
 }
@@ -38,12 +48,12 @@ template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int class_num) {
   int tid = threadIdx.x;
-  T val = 0;
+  T val(0);
   int idx = blockIdx.x * class_num + tid;
   int end = blockIdx.x * class_num + class_num;
   for (; idx < end; idx += blockDim.x) {
-    val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
+    val += math::TolerableValue<T>()(real_log(X[idx])) * label[idx];
   }
   val = paddle::platform::reduceSum(val, tid, blockDim.x);
@@ -53,8 +63,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
 }
 }  // namespace
-using Tensor = framework::Tensor;
 template <typename T>
 class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
  public:
@@ -89,6 +97,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
 template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
 template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
+template class CrossEntropyFunctor<platform::CUDADeviceContext,
+                                   platform::float16>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
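real_log above dispatches by overload: float and double go straight to the CUDA intrinsics, while float16 widens to float, calls logf, and narrows back. The same widen-compute-narrow pattern extends to other math functions float16 lacks; a standalone sketch with a hypothetical Half type (Paddle's platform::float16 converts to and from float the same way):

#include <cmath>

// hypothetical 16-bit float stand-in; storage faked as float for brevity
struct Half {
  float v;
  explicit Half(float f) : v(f) {}
  explicit operator float() const { return v; }
};

// widen -> compute in float -> narrow, mirroring real_log for float16
inline Half real_exp(const Half& x) {
  return Half(std::exp(static_cast<float>(x)));
}

int main() { return static_cast<float>(real_exp(Half(0.f))) == 1.f ? 0 : 1; }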
[The remaining 32 file diffs in this commit are collapsed on the page and not shown.]