diff --git a/.gitignore b/.gitignore index 10a4262aa7e129c48d79fbe7d978720b28f4bcea..369fa1cb919c82caec326d1429c8a2eba3b928d6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +python/paddle/fluid/tests/unittests/reader_reset_test.recordio paddle/operators/check_t.save paddle/operators/check_tensor.ls paddle/operators/tensor.save diff --git a/AUTHORS.md b/AUTHORS.md index 4060f75613ac4dadf353ff53a73fd0647a8052be..54a1097b50f7a09062f8987e62db6b5f5e89e0b7 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -42,6 +42,7 @@ | QiJune | Jun Qi | | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | +| Sand3r- | Michal Gallus | | Superjom | Chun-Wei Yan | | tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 729bdcb3dc5324df0a5272402ef203012be0072a..7355b67ab1020f58760f23b1a20ca189591db35e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -166,8 +166,8 @@ copy(framework_lib DEPS ${framework_lib_deps} set(module "memory") copy(memory_lib - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation ) set(inference_deps paddle_fluid_shared paddle_fluid) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 42af482f852a1f32fa0fb5595b7c533d612f7d02..43e1bc6b2efecd24d2d5bade5b6a7727bfb0a607 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -31,9 +31,7 @@ function(windows_symbolic TARGET) endfunction() add_subdirectory(ir) -if (NOT WIN32) add_subdirectory(details) -endif (NOT WIN32) # ddim lib proto_library(framework_proto SRCS framework.proto) @@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) -if (NOT WIN32) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) -else() -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor) -endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -179,12 +172,10 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -if (NOT WIN32) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor) -endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 949616f02d5168e6abab932d608e4b20ee64304a..c3a8b85423403992e3a12ceb0a1acbae82d25dfa 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -13,9 +13,9 @@ // limitations under the License. 
#pragma once +#include <ThreadPool.h> #include <string> #include <vector> -#include "ThreadPool.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 40b0130b265471a1288d966c4cbcd4f0e1bdb9f1..6918e030bf859bc8a55baed9d944e16217b0efb6 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -100,6 +100,7 @@ class OperatorBase { const std::string& Type() const { return type_; } + bool HasAttr(const std::string& name) const { return attrs_.count(name); } template <typename T> inline const T& Attr(const std::string& name) const { PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index eb89fc5e1124e97b082d6299e3efc44591a8b01b..0c73778b201d77a6e8a35a38d17f2a86d5faaca9 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -7,16 +7,17 @@ set(analysis_deps # analysis_deps can be extended across the project add_subdirectory(ir_passes) add_subdirectory(passes) -cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES}) +cc_library(analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_fluid_api) + +cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES} analysis_helper) cc_library(argument SRCS argument.cc DEPS scope proto_desc) cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc) cc_library(analysis SRCS analyzer.cc - helper.cc analysis_pass - DEPS ${analysis_deps} + DEPS ${analysis_deps} analysis_helper ) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 48fc5dda2a5bfa24d679d4bf655e580dafc614b3..84a0c3374c66f85313828332099cb372e14c7c83 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -30,6 +30,7 @@ TEST(Analyzer, analysis_without_tensorrt) { Argument argument; argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + argument.SetUseGPU(false); Analyzer analyser; analyser.Run(&argument); @@ -41,6 +42,7 @@ TEST(Analyzer, analysis_with_tensorrt) { argument.SetTensorRtWorkspaceSize(1 << 20); argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + argument.SetUseGPU(false); Analyzer analyser; analyser.Run(&argument); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index d7a2f3d1e3a3251263c8670aef5db538fa2c48ea..21203e2d9f4e4cd22ea49ea7b6808aff07e70eff 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -116,6 +116,7 @@ struct Argument { std::vector<std::string>); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); + DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, std::function<bool(const framework::ir::Node*)>); diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index c71cff889ed7cdb95f79b9bc89a9ca5ab370271c..822c7799bb3ae6d79da6cf2a7b3c8c9b20353ed7 100644 ---
a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -4,4 +4,6 @@ set(analysis_deps ${analysis_deps} subgraph_detector tensorrt_subgraph_pass CACHE INTERNAL "") +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) +file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 267737e95cb51f9a2d72b70b171be779d6f52c8d..108cb6f74b1208395a4faabdf6184152c300d244 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -46,7 +46,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", - "conv2d_transpose"}); + "conv2d_transpose", "leaky_relu"}); if (!node->IsOp()) return false; if (teller_set.count(node->Op()->Type())) { diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index a30fef08b5726c965637e2fb489bdb2036bd2a8d..d5e0d90de1da8e54e2411c266f7a8c09c33b0336 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -30,15 +30,28 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { if (!argument->scope_valid()) { argument->SetScope(new framework::Scope); } + PADDLE_ENFORCE(argument->use_gpu_valid()); + + // The load program should run on the same device as the inference program, + // so that the parameters will be on the same device; otherwise they will + // keep being copied between different devices.
+ platform::Place place; + if (argument->use_gpu()) { + PADDLE_ENFORCE(argument->gpu_device_id_valid()); + place = platform::CUDAPlace(argument->gpu_device_id()); + } else { + place = platform::CPUPlace(); + } if (argument->model_dir_valid()) { - auto program = LoadModel(argument->model_dir(), argument->scope_ptr()); + auto program = + LoadModel(argument->model_dir(), argument->scope_ptr(), place); argument->SetMainProgram(program.release()); } else if (argument->model_program_path_valid() && argument->model_params_path_valid()) { auto program = LoadModel(argument->model_program_path(), argument->model_params_path(), - argument->scope_ptr()); + argument->scope_ptr(), place); argument->SetMainProgram(program.release()); } else { PADDLE_THROW( @@ -52,16 +65,15 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { } std::unique_ptr IrGraphBuildPass::LoadModel( - const std::string &path, framework::Scope *scope) { - platform::CPUPlace place; + const std::string &path, framework::Scope *scope, + const platform::Place &place) { framework::Executor exe(place); return Load(&exe, scope, path); } std::unique_ptr IrGraphBuildPass::LoadModel( const std::string &program_path, const std::string ¶ms_path, - framework::Scope *scope) { - platform::CPUPlace place; + framework::Scope *scope, const platform::Place &place) { framework::Executor exe(place); return Load(&exe, scope, program_path, params_path); } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index 3291e4f6ad3ca3079e672350805cab1f1e7b2413..271e64fce579bc9001b1dd632576571cec949752 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace inference { @@ -32,11 +33,12 @@ class IrGraphBuildPass : public AnalysisPass { std::string repr() const override; private: - std::unique_ptr LoadModel(const std::string &path, - framework::Scope *scope); + std::unique_ptr LoadModel( + const std::string &path, framework::Scope *scope, + const platform::Place &place); std::unique_ptr LoadModel( const std::string &program_path, const std::string ¶ms_path, - framework::Scope *scope); + framework::Scope *scope, const platform::Place &place); std::string model_binary_str_; }; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 82f74a269a5915dfa1d97a28f5ae15a12ea0b154..e9969b84f33483b048951f704de1e13e51cbeaea 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -27,11 +27,10 @@ endif() cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) -cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) -cc_library(zero_copy_tensor_dummy SRCS 
details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) - +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager) +cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce) +cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor) cc_test(test_paddle_inference_api SRCS api_tester.cc diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d19505877bbc1110fcf5787fffc1436d242a7cdc..cb14d2a2602808bd35106ed2bafcf7975f549597 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -285,6 +285,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { status_program_optimized_ = true; argument_.SetUseGPU(config_.use_gpu); + argument_.SetGPUDeviceId(config_.device); // Analyze inference_program if (!config_.model_dir.empty()) { argument_.SetModelDir(config_.model_dir); @@ -491,8 +492,7 @@ bool AnalysisPredictor::LoadParameters() { } // Use NaiveExecutor to Load parameters. - platform::CPUPlace place; - framework::NaiveExecutor e(place); + framework::NaiveExecutor e(place_); e.Prepare(scope_.get(), *load_program, 0, false); e.Run(); VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load"; @@ -551,4 +551,5 @@ USE_TRT_CONVERTER(pad); USE_TRT_CONVERTER(split); USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); +USE_TRT_CONVERTER(leaky_relu); #endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 825bee833bf918067497f56adebbbcaf55f892a2..12e3a6f42e14010feedbbb5d8f8a98f60cea4556 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -116,8 +116,12 @@ class CpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { + // TODO(NHZlX) Problem with data synchronization between GPU and CPU. + // When running in GPU mode, the parameters are all on the GPU, but the + // operations of "conv_bn_fuse_pass" run on the CPU.
passes_.assign({ - "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", + // "infer_clean_graph_pass", "conv_bn_fuse_pass", }); } diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 8dd6e8453f971b980a3dcb95c10734537e8333dd..27fb41d16ead65a1ec075399bcda135e2238c7ba 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -2,7 +2,7 @@ nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc - pad_op.cc split_op.cc prelu_op.cc + pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -38,3 +38,5 @@ nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin prelu_op SERIAL) +nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f6ed04c46d70b1ab68b4c01ef0c908a1a8d1a19 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// LeakyRelu converter from fluid to tensorRT +class LeakyReluOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + PADDLE_ENFORCE(input_num == 1); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + // Get output + size_t output_num = op_desc.Output("Out").size(); + PADDLE_ENFORCE(output_num == 1); + // Get attrs + float alpha = boost::get<float>(op_desc.GetAttr("alpha")); + + platform::CPUPlace place; + std::unique_ptr<framework::LoDTensor> alpha_tensor( + new framework::LoDTensor()); + alpha_tensor->Resize(framework::make_ddim({2})); + float* alpha_data = alpha_tensor->mutable_data<float>(place); + alpha_data[0] = alpha; + alpha_data[1] = 1.f - alpha; + // the leaky relu formula y = (x > 0) ? x : alpha * x is equal to + // y = alpha * x + ((x > 0) ? (1 - alpha) * x : 0)
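+ // Worked example with alpha = 0.1: x = 2 gives 0.1 * 2 + 0.9 * relu(2) = 0.2 + 1.8 = 2 = x, + // and x = -2 gives 0.1 * (-2) + 0.9 * relu(-2) = -0.2 + 0 = -0.2 = alpha * x, + // so both branches of leaky_relu are reproduced by the scale/relu/sum layers below.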
+ TensorRTEngine::Weight scale{nvinfer1::DataType::kFLOAT, &alpha_data[0], 1}; + TensorRTEngine::Weight shift{nvinfer1::DataType::kFLOAT, nullptr, 0}; + TensorRTEngine::Weight power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + // y_scale = alpha * x + auto* scale_layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift.get(), scale.get(), power.get()); + PADDLE_ENFORCE(nullptr != scale_layer); + // y_relu = (x > 0) ? x : 0 + auto* relu_layer = TRT_ENGINE_ADD_LAYER(engine_, Activation, *input, nvinfer1::ActivationType::kRELU); + PADDLE_ENFORCE(nullptr != relu_layer); + // y_scale_relu = (1 - alpha) * y_relu + TensorRTEngine::Weight sub_scale{nvinfer1::DataType::kFLOAT, &alpha_data[1], + 1}; + auto* scale_relu_layer = + TRT_ENGINE_ADD_LAYER(engine_, Scale, *(relu_layer->getOutput(0)), + nvinfer1::ScaleMode::kUNIFORM, shift.get(), + sub_scale.get(), power.get()); + PADDLE_ENFORCE(nullptr != scale_relu_layer); + auto* output_layer = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)), + *(scale_relu_layer->getOutput(0)), + nvinfer1::ElementWiseOperation::kSUM); + PADDLE_ENFORCE(nullptr != output_layer); + // keep the alpha tensor alive to avoid releasing its memory + std::string alpha_name = op_desc.Output("Out")[0] + "_alpha"; + PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) == + engine_->weight_map.end()); + engine_->weight_map[alpha_name] = std::move(alpha_tensor); + + std::string layer_name = "leaky_relu (Output: "; + auto output_name = op_desc.Output("Out")[0]; + output_layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, output_layer->getOutput(0)); + layer_name += output_name; + if (test_mode) { + engine_->DeclareOutput(output_name); + } + output_layer->setName((layer_name + ")").c_str()); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d00826af075159004d3727a7519e7c319dbddb02 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include <gtest/gtest.h> +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(leaky_relu_op, test_leaky_relu) { + std::unordered_set<std::string> parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("leaky_relu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("leaky_relu_out", nvinfer1::DimsCHW(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("leaky_relu"); + desc.SetInput("X", {"leaky_relu_input"}); + desc.SetOutput("Out", {"leaky_relu_out"}); + + desc.SetAttr("alpha", 0.1f); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 40902694995f487343e3cfc727ad95daafa6b415..a0329325bea19bd9cdd3fcd39724cf05664b505a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,3 +1,3 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu - DEPS enforce device_context) + DEPS enforce tensorrt_engine) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 16a9b50e6fb174374d23cd021e47e52921871a8a..e8bd13037ed6c2c3c639b76f6f3561921fb6ee37 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,5 +1,9 @@ set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) +if(WITH_GPU AND TENSORRT_FOUND) + set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) +endif() + function(download_model install_dir model_name) if (NOT EXISTS ${install_dir}) inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name}) @@ -27,14 +31,14 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename endfunction() # RNN1 -if(NOT APPLE) +if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) else() - # TODO: fix this test on MACOS, the reason is that - # fusion_seqexpand_concat_fc_op is not supported on MACOS - message(WARNING "These tests have been disabled on OSX until they are fixed: \n test_analyzer_rnn1") + # TODO: fix this test on MACOS and OPENBLAS, the reason is that + # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS + message(WARNING "These tests have been disabled on OSX or with WITH_MKL=OFF until they are fixed: \n test_analyzer_rnn1") endif() # RNN2 @@ -75,11 +79,11 @@ endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) # resnet50 -inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 +inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") # mobilenet with depthwise_conv op -inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
+inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") # anakin @@ -89,15 +93,15 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1") inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin") inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt") - cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc - ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin + cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc + ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt DEPS inference_anakin_api_shared SERIAL) # anakin mobilenet if(WITH_GPU) set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet") inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin") - cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc + cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin DEPS inference_anakin_api_shared dynload_cuda SERIAL) endif() @@ -109,6 +113,6 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") endif() inference_analysis_test(test_trt_models SRCS trt_models_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index e66ae2805766754d9d07877c31889dd421daf9f1..7b686045a59c93a93322f99c2cdf7050ddbf0a6d 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -222,19 +222,36 @@ void TestMultiThreadPrediction( // The inputs of each thread are all the same. 
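+ // A warm-up pass (added below) runs once before the timed loop so that + // one-off costs such as memory allocation and lazy initialization do not + // skew the reported latency; with FLAGS_profile set, the profiler is reset + // afterwards so only the timed iterations are recorded.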
std::vector outputs_tid; auto &predictor = predictors[tid]; - LOG(INFO) << "running thread " << tid; - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - for (const auto &input : inputs) { - ASSERT_TRUE(predictor->Run(input, &outputs_tid)); + + // warmup run + LOG(INFO) << "Running thread " << tid << ", warm up run..."; + { + Timer warmup_timer; + warmup_timer.tic(); + predictor->Run(inputs[0], outputs, batch_size); + PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); +#if !defined(_WIN32) + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); } +#endif } - auto time = timer.toc(); - total_time += time; - PrintTime(batch_size, num_times, num_threads, tid, time / num_times, - inputs.size()); + LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; + { + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (const auto &input : inputs) { + ASSERT_TRUE(predictor->Run(input, &outputs_tid)); + } + } + + auto time = timer.toc(); + total_time += time; + PrintTime(batch_size, num_times, num_threads, tid, time / num_times, + inputs.size()); + } }); } for (int i = 0; i < num_threads; ++i) { diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 922feba10fec5d1d13b47dbce064fce2e01d8998..ef612ce6148329c33f194842945bb5438afcf645 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -145,5 +145,3 @@ TEST(TensorRT_mobilenet, analysis) { } // namespace inference } // namespace paddle - -USE_PASS(tensorrt_subgraph_pass); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..10290a4aeff6b6a023fb28961d12728aff891e83 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" + +#include "paddle/fluid/platform/mkldnn_helper.h" + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include "xbyak.h" +#include "xbyak_util.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +using mkldnn::memory; + +static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) { + std::transform(format.begin(), format.end(), format.begin(), ::tolower); + + if (!format.compare("nchw")) { + return memory::format::nchw; + } else if (!format.compare("nchw16c")) { + return memory::format::nChw16c; + } else if (!format.compare("nchw8c")) { + return memory::format::nChw8c; + } else if (!format.compare("nhwc")) { + return memory::format::nhwc; + } else { + return memory::format::any; + } +} + +static void UpdateDataFormat(const framework::ExecutionContext& ctx, + framework::Tensor* tensor, const char* attribute) { + if (ctx.op().HasAttr(attribute)) { + auto format_as_string = ctx.Attr(attribute); + auto format = StringToMKLDNNFormat(format_as_string); + if (format != memory::format::any) { + tensor->set_format(format); + } + } +} + +template +static void ReorderInput(framework::Tensor* tensor, + const platform::Place& place, + const mkldnn::engine& engine, bool isFourDim) { + using platform::to_void_cast; + auto dims = paddle::framework::vectorize2int(tensor->dims()); + framework::Tensor out_tensor; + out_tensor.Resize(tensor->dims()); + out_tensor.set_format(isFourDim ? memory::format::nchw : memory::format::nc); + out_tensor.set_layout(tensor->layout()); + mkldnn::memory input_memory = { + {{dims, platform::MKLDNNGetDataType(), tensor->format()}, engine}, + to_void_cast(tensor->data())}; + mkldnn::memory output_memory = { + {{dims, platform::MKLDNNGetDataType(), out_tensor.format()}, engine}, + to_void_cast(out_tensor.mutable_data(place))}; + platform::Reorder(input_memory, output_memory); + tensor->ShareDataWith(out_tensor); +} + +template +class ElementwiseMulMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + int axis = ctx.Attr("axis"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + const T* x_data = x->data(); + const T* y_data = y->data(); + T* z_data = z->mutable_data(ctx.GetPlace()); + + auto x_dims = x->dims(); + auto y_dims_untrimmed = y->dims(); + auto x_int_dims = paddle::framework::vectorize2int(x_dims); + + UpdateDataFormat(ctx, (Tensor*)x, "x_data_format"); + UpdateDataFormat(ctx, (Tensor*)y, "y_data_format"); + + Xbyak::util::Cpu cpu; + const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F); + const bool are_dims_divisable = !(x_int_dims[1] % 16); + const bool is_x_format_correct = x->format() == memory::format::nChw16c; + const bool is_y_format_correct = y->format() == memory::format::nc; + if (is_x_format_correct && is_y_format_correct && are_dims_divisable && + is_avx512_enabled) { + int pre, n, post; + get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post); + + if (post == 1) { + PADDLE_THROW("Not implemented when post is 1"); + } else { + // Just check whether it works for RE-Resnext. 
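+ // Layout note: in nChw16c, channels are grouped into blocks of 16 stored + // innermost, so element (n, 16 * ci + k, hi, wi) lives at offset + // ((n * C + ci) * h * w + hi * w + wi) * 16 + k, where C = c / 16. The NC + // input y supplies one scalar per (n, channel), which the JIT kernel below + // broadcasts over all h * w spatial positions of the matching x block.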
+ PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions"); + + int n = x_dims[0]; + int c = x_dims[1]; + int h = x_dims[2]; + int w = x_dims[3]; + + PADDLE_ENFORCE(y_dims_untrimmed[0] == n && y_dims_untrimmed[1] == c, + "Y should be in nc format"); + + constexpr int simd_width = 16; + int C = c / simd_width; + + const auto& multiply = + math::jitkernel::KernelPool::Instance() + .template Get>(n); + +#pragma omp parallel for collapse(2) + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_z = + z_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + multiply->Compute(ptr_x, ptr_y, ptr_z, h, w); + } + } + } + + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); + } else { + // Fallback to naive version: + const bool are_inputs_in_same_format = x->format() == y->format(); + const bool is_x_nchw = x->format() == memory::format::nchw; + const bool is_x_nc = x->format() == memory::format::nc; + const bool is_y_nchw = y->format() == memory::format::nchw; + const bool is_y_nc = y->format() == memory::format::nc; + if (!are_inputs_in_same_format) { + using platform::MKLDNNDeviceContext; + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + if (!(is_x_nchw || is_x_nc)) + ReorderInput((Tensor*)x, ctx.GetPlace(), mkldnn_engine, + x->dims().size() == 4); + if (!(is_y_nchw || is_y_nc)) + ReorderInput((Tensor*)y, ctx.GetPlace(), mkldnn_engine, + y->dims().size() == 4); + } + + auto mul_func = [](T a, T b) -> T { return a * b; }; + + TransformFunctor + functor( + x, y, z, + ctx.template device_context(), + mul_func); + + axis = (axis == -1 ? x_dims.size() - y_dims_untrimmed.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + auto y_dims = trim_trailing_singular_dims(y_dims_untrimmed); + axis = (y_dims.size() == 0) ? x_dims.size() : axis; + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); + + if (post == 1) { + functor.RunRowWise(n, pre); + } else { + functor.RunMidWise(n, pre, post); + } + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_mul, MKLDNN, ::paddle::platform::CPUPlace, + ops::ElementwiseMulMKLDNNKernel) diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index f01f67692e1e5dd040971cb0dd1dd793648da97a..85a7817be9b3a82d40853b417d78a7fdf67f6c1f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -97,6 +97,20 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { .EqualGreaterThan(-1); AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") .SetDefault(false); + AddAttr( + "x_data_format", + "(string, default NCHW) Only used in mkldnn" + "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". " + "Defaults to \"\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault(""); + AddAttr( + "y_data_format", + "(string, default \"\") Only used in mkldnn" + "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". " + "Defaults to \"\". 
Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault(""); AddComment(string::Sprintf(R"DOC( Elementwise %s Operator diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 65f83ff4846601d1575daa994772cd869d526f56..64ef55de7cf73fea4538cc0d8fa6d316ddaff2f8 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -322,6 +322,42 @@ class VActJitCode : public JitCode { ymm_t ymm_dst = ymm_t(1); }; +#ifdef PADDLE_WITH_MKLDNN +struct EltwiseMulnChw16cNC : public Xbyak::CodeGenerator { + explicit EltwiseMulnChw16cNC(size_t code_size = 256 * 1024) + : Xbyak::CodeGenerator(code_size) { + // RDI is ptr x_input + // RSI is ptr y_input + // RDX is ptr output + // RCX is height + // r8 is width + + push(rbx); + + xor_(rax, rax); + xor_(r10, r10); + vmovups(zmm3, ptr[rsi]); + + L("h_loop"); + xor_(rbx, rbx); + L("w_loop"); + vmovups(zmm2, ptr[rdi + rax]); + vmulps(zmm1, zmm2, zmm3); + vmovups(ptr[rdx + rax], zmm1); + add(rax, 64); + inc(rbx); + cmp(r8, rbx); + jnz("w_loop"); + inc(r10); + cmp(r10, rcx); + jnz("h_loop"); + + pop(rbx); + ret(); + } +}; +#endif + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 7e163c1349e73d8fe5e436b98c9a8f67e6439506..82d808f415c3b4ed2688d034aad13610ae2ab0f4 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -95,6 +95,15 @@ class VAddBiasKernel : public Kernel { void (*Compute)(const T *, const T *, T *, int); }; +#ifdef PADDLE_WITH_MKLDNN +template +class EltwiseMulnChw16cNCKernel : public Kernel { + public: + // nChw16c = nChw16c .* NC + void (*Compute)(const float *, const float *, float *, int, int); +}; +#endif + template class VActKernel : public Kernel { public: diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 36a50f20434f313e93bfa3dd2c9d46963024caf7..a143b51439f55d1f80d7936dfad46e31bd19f0cb 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -226,6 +226,44 @@ bool VAddKernelImpl::useMKL(int d) { } #endif +#ifdef PADDLE_WITH_MKLDNN +/* EltwiseMul for nChw16c & NC inputs JitKernel */ +template +class EltwiseMulnChw16cNCKernelImpl + : public math::jitkernel::EltwiseMulnChw16cNCKernel { + public: + JITKERNEL_DECLARE_STATIC_FUNC; + explicit EltwiseMulnChw16cNCKernelImpl(int d) + : EltwiseMulnChw16cNCKernel() { + using mul_func_t = void (*)(const float*, const float*, float*, int, int); +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + // roughly estimate the size of code + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8; + sz = sz > 4096 ? 
sz : 4096; + jitcode_.reset(new gen::EltwiseMulnChw16cNC(sz)); + this->Compute = (mul_func_t)jitcode_->getCode(); + return; + } +#endif + PADDLE_THROW( + "This kernel shouldn't be used in a non-Xbyak, non-MKL-DNN " + "environment"); + } + +#ifdef PADDLE_WITH_XBYAK + + private: + std::unique_ptr<gen::EltwiseMulnChw16cNC> jitcode_{nullptr}; +}; + +template <> +bool EltwiseMulnChw16cNCKernelImpl<float>::useJIT(int d) { + return true; +} +#endif +#endif + /* VAddRelu JitKernel */ template <typename T> class VAddReluKernelImpl : public VAddReluKernel<T> { @@ -394,6 +432,9 @@ REGISTER_JITKERNEL(vscal, VScalKernel); REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); REGISTER_JITKERNEL(vrelu, VReluKernel); REGISTER_JITKERNEL(videntity, VIdentityKernel); +#ifdef PADDLE_WITH_MKLDNN +REGISTER_JITKERNEL(eltwise_mul_nchw16c, EltwiseMulnChw16cNCKernel); +#endif } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0d0613e1a4364e300640b62687c8a045e40b9ca9..93cb5eb2dc0b3480ebd05dcc6b36d8915d057bab 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,4 +1,3 @@ -if (NOT WIN32) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) py_proto_compile(profiler_py_proto SRCS profiler.proto) @@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _ add_dependencies(profiler_py_proto profiler_py_proto_init) +if (NOT WIN32) add_custom_command(TARGET profiler_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) +string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") +add_custom_command(TARGET profiler_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) if(WITH_GPU) @@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) - -if (NOT WIN32) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) -endif(NOT WIN32) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index f59fc40b71699a790978e22fd7e26da8d4d94c5f..eaf047d4744762f69d50bff8d467da8e3b8317cc 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,17 +13,11 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #pragma once -#if !defined(_WIN32) -#include -#else -#include -#endif // !_WIN32 - -#include #include // NOLINT #include #include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.pb.h" namespace paddle { @@ -32,15 +26,11 @@ namespace platform { /////////////////////// // WARN: Under Development. Don't depend on it yet. ////////////////////// -#if !defined(_WIN32) inline uint64_t PosixInNsec() { struct timeval tv; gettimeofday(&tv, nullptr); return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); } -#else -inline uint64_t PosixInNsec() { return static_cast(0); } -#endif // !_WIN32 // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 02639e2177f0ef2a2f704fce724defe11cc09045..0ccef6c6a8345e31cee3ef2422fe3f56c059c231 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -38,6 +38,7 @@ std::once_flag p2p_init_flag; void InitGflags(std::vector argv) { std::call_once(gflags_init_flag, [&]() { + FLAGS_logtostderr = true; argv.insert(argv.begin(), "dummy"); int argc = argv.size(); char **arr = new char *[argv.size()]; diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index a07b993c8a84e26fb079804b6ec3ff9df3efd09e..8be77fe46457511220a26b9733df2c4b8d013823 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -27,6 +28,7 @@ #include // dladdr #include // backtrace #include +#include #include // std::accumulate #else #include // _popen, _pclose @@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +static int gettimeofday(struct timeval *tp, void *tzp) { + time_t clock; + struct tm tm; + SYSTEMTIME wtm; + + GetLocalTime(&wtm); + tm.tm_year = wtm.wYear - 1900; + tm.tm_mon = wtm.wMonth - 1; + tm.tm_mday = wtm.wDay; + tm.tm_hour = wtm.wHour; + tm.tm_min = wtm.wMinute; + tm.tm_sec = wtm.wSecond; + tm.tm_isdst = -1; + clock = mktime(&tm); + tp->tv_sec = clock; + tp->tv_usec = wtm.wMilliseconds * 1000; + + return (0); +} #endif // !_WIN32 static void ExecShellCommand(const std::string &cmd, std::string *message) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 56bf9e31a35fdec5b7f04849068ff96ac9776c0e..03c102e24a189ea9e1614ffa6773e0c6e742c107 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/port.h" -#include #include #include #include @@ -438,10 +438,10 @@ void ParseEvents(const std::vector>& events, event_items[index].total_time += event_time; // min time event_items[index].min_time = - std::min(event_time, event_items[index].min_time); + (std::min)(event_time, event_items[index].min_time); // max time event_items[index].max_time = - std::max(event_time, event_items[index].max_time); + (std::max)(event_time, event_items[index].max_time); } // remove the push marker from the list diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index e8eae874afa3d17f0d3374eef457cdbacb3f8424..f5d3490634f3199a23986ec3ae13d9fe3577ac35 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); -#if !defined(_WIN32) struct RecordEvent { // dev_ctx can be set to nullptr if device is cpu. RecordEvent(const std::string& name, const DeviceContext* dev_ctx); @@ -106,15 +105,6 @@ struct RecordBlock { std::string name_; uint64_t start_ns_; }; -#else -// windows do not support profiler temporarily. -struct RecordEvent { - RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {} -}; -struct RecordBlock { - explicit RecordBlock(int block_id) {} -}; -#endif // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0e88a439cf6ca83e3d98725f58875adeeea86be0..11c68f3449ee26b64c121acd081479b37c94fac4 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -45,16 +45,15 @@ class StreamCallbackManager { inline void AddCallback(Callback &&callback) const { auto *stream_callback_context = new StreamCallbackContext(this, std::forward(callback)); - PADDLE_ENFORCE( #if CUDA_VERSION >= 10000 - cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context) + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); // NOLINT #else - cudaStreamAddCallback(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0) + PADDLE_ENFORCE(cudaStreamAddCallback( + stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); // NOLINT #endif - ); // NOLINT } void Wait() const { thread_pool_.reset(new ThreadPool(1)); } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 25e919105cba48ea28b6d429542c33ba48515ef5..fb6ee2f4a53925f64b61b9fe03f5a4bd7203ed53 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,9 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) -if(NOT WIN32) - list(APPEND PYBIND_DEPS parallel_executor profiler) -endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 
2f040e1c34c7956959dc9fe55b4aaf0b3b6f880c..102fa02adf3fb05e36183f87e95d8178be4e8742 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,9 +36,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" -#ifndef _WIN32 #include "paddle/fluid/framework/parallel_executor.h" -#endif #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle. #endif #endif -#ifndef _WIN32 py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); -#endif py::class_> pass(m, "Pass"); pass.def(py::init()) @@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. .def("remove_pass", [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); -#ifndef _WIN32 // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( @@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(fetch_tensors, fetched_var_name); }); -#endif BindRecordIOWriter(&m); return m.ptr(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 6a4a5e098fc27e02104baa837d939a62cc2a22a8..543acf2d349c7d02480204699f497536c7a4ca60 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -47,8 +47,7 @@ from . import profiler from . import unique_name from . import recordio_writer from . import parallel_executor -if os.name != 'nt': - from .parallel_executor import * +from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 0d53f53a9ef6d5f90d188e1ad10785b5558c3e80..3f4dd5eb712e738bbee8f93c062375033b8ab2f6 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -25,264 +25,263 @@ import os __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] -if os.name != 'nt': - ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy - BuildStrategy = core.ParallelExecutor.BuildStrategy - - class ParallelExecutor(object): +ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy +BuildStrategy = core.ParallelExecutor.BuildStrategy + + +class ParallelExecutor(object): + """ + ParallelExecutor is designed for data parallelism, which focuses on distributing + the data across different nodes and every node operates on the data in parallel. + If you use ParallelExecutor to run the current program on GPU, the node means GPU + device, and ParallelExecutor will get the available GPU device automatically on + the current machine. 
If you use ParallelExecutor to run the current program on CPU, + the node means the CPU device, and you can specify the CPU device number by setting the + 'CPU_NUM' environment variable, for example 'CPU_NUM=4'; if the environment variable + is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number + of CPUs in the system. + + Args: + use_cuda (bool): Whether to use CUDA or not. + loss_name (str): The loss name must be set in training. Default None. + main_program (Program): The program that needs to run; if not provided, + then default_main_program will be used. Default None. + share_vars_from(ParallelExecutor): If provided, it will share variables + from the specified ParallelExecutor. Default None. + exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run + the program in ParallelExecutor, for example how many threads are used to + execute the program, how many iterations to clean up the temp variables + which are generated during execution. For more information, please refer + to fluid.ExecutionStrategy. Default None. + build_strategy(BuildStrategy): build_strategy is used to control how to + build the SSA Graph in ParallelExecutor by setting the property, + for example reduce_strategy, gradient_scale_strategy. For more information, + please refer to fluid.BuildStrategy. Default None. + num_trainers(int): If greater than 1, NCCL will be initialized with + multiple ranks of nodes; each node should have the same number of GPUs. + Distributed training will be enabled then. Default 1. + trainer_id(int): Must be used together with num_trainers. trainer_id is the + "rank" of the current node and starts from 0. Default 0. + scope(Scope): scope to run with; defaults to fluid.global_scope(). + + Returns: + ParallelExecutor: The initialized ParallelExecutor object. + + Raises: + TypeError: If share_vars_from is provided, but not ParallelExecutor object. + + Examples: + .. code-block:: python + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor(use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + test_loss, = test_exe.run([loss.name], feed=feed_dict) + """ + + def __init__(self, + use_cuda, + loss_name=None, + main_program=None, + share_vars_from=None, + exec_strategy=None, + build_strategy=None, + num_trainers=1, + trainer_id=0, + scope=None): + self._places = [] + self._act_places = [] + if use_cuda: + for i in six.moves.range(core.get_cuda_device_count()): + p = core.Place() + self._act_places.append(core.CUDAPlace(i)) + p.set_place(self._act_places[-1]) + self._places.append(p) + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + for i in six.moves.range(cpu_num): + p = core.Place() + self._act_places.append(core.CPUPlace()) + p.set_place(self._act_places[-1]) + self._places.append(p) + assert self._places, "no place for execution" + + if exec_strategy is None: + exec_strategy = ExecutionStrategy() + exec_strategy.use_cuda = use_cuda + + if exec_strategy.num_threads == 0: + if use_cuda: + # Experiments on se-resnext show that too many threads hurt + # performance. Worth tuning for other models in the future. + exec_strategy.num_threads = len(self._places) * 4 + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exec_strategy.num_threads = cpu_num * 2 + + # Set 1 thread num under nccl2 distribute + # env to make sure all gpus run ops in same order.
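+ # (With multiple trainers, every GPU must launch its NCCL collectives + # in exactly the same order or the communicator can deadlock; a single + # scheduling thread keeps the op order deterministic across devices.)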
+ if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 + + if build_strategy is None: + build_strategy = BuildStrategy() + + main = main_program + main = main if main else framework.default_main_program() + if scope == None: + scope = executor.global_scope() + + if share_vars_from and not isinstance(share_vars_from, + ParallelExecutor): + raise TypeError("share_vars_from must be ParallelExecutor.") + + local_scopes = share_vars_from.executor.local_scopes( + ) if share_vars_from else [] + + self.persistable_vars = [ + v.name for v in [ + var for var in main.list_vars() + if var.persistable and var.type != core.VarDesc.VarType.RAW + ] + ] + + self.executor = core.ParallelExecutor( + self._places, + set([ + cpt.to_text(p.name) + for p in main.global_block().iter_parameters() + if not p.stop_gradient + ]), + set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + cpt.to_text(loss_name) + if loss_name else six.u(''), scope, local_scopes, exec_strategy, + build_strategy, num_trainers, trainer_id) + self.scope = scope + + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): """ - ParallelExecutor is designed for data parallelism, which focuses on distributing - the data across different nodes and every node operates on the data in parallel. - If you use ParallelExecutor to run the current program on GPU, the node means GPU - device, and ParallelExecutor will get the available GPU device automatically on - the current machine. If you use ParallelExecutor to run the current program on CPU, - the node means the CPU device, and you can specify the CPU device number by adding - 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable - is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number - of CPUs in the system. + Run a parallel executor with fetch_list. + + The feed parameter can be a dict or a list. If feed is a dict, the + feed data will be split into multiple devices. If feed is a list, we + assume the data has already been split across multiple devices, and each + element in the list will be copied to each device directly. + + For example, if the feed is a dict: + + >>> exe = ParallelExecutor() + >>> # the image will be split across devices. If there are two devices, + >>> # each device will process an image with shape (24, 1, 28, 28) + >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) + + For example, if the feed is a list: + + >>> exe = ParallelExecutor() + >>> # each device will process each element in the list. + >>> # the 1st device will process an image with shape (48, 1, 28, 28) + >>> # the 2nd device will process an image with shape (32, 1, 28, 28) + >>> # + >>> # you can use exe.device_count to get the device number. + >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, + >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, + >>> ]) Args: - use_cuda (bool): Whether to use CUDA or not. - loss_name (str): The loss name must set in training. Default None. - main_program (Program): The program that need to run, if not provided, - then default_main_program will be used. Default None. - share_vars_from(ParallelExecutor): If provide, it will share variables - from the specified ParallelExecutor. Default None.
-            exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
-                the program in ParallelExecutor, for example how many threads are used to
-                execute the program, how many iterations to clean up the temp variables
-                which is generated during execution. For more information, please refer
-                to fluid.ExecutionStrategy. Default None.
-            build_strategy(BuildStrategy): build_strategy is used to control how to
-                build the SSA Graph in ParallelExecutor by setting the property,
-                for example reduce_strategy, gradient_scale_strategy. For more information,
-                please refer to fluid.BuildStrategy. Default None.
-            num_trainers(int): If greater than 1, NCCL will be initialized with
-                multiple rank of nodes, each node should have same number of GPUs.
-                Distributed training will be enabled then. Default 1.
-            trainer_id(int): Must use together with num_trainers. trainer_id is the
-                "rank" of current node starts from 0. Default 0.
-            scope(Scope): scope to run with, default use fluid.global_scope().
+            fetch_list(list): The names of the variables to fetch.
+            feed(list|dict|None): The feed variables. If the feed is a dict,
+                tensors in that dict will be split onto each device. If the
+                feed is a list, each element of the list will be copied
+                to each device. Default None.
+            feed_dict: Alias for the feed parameter, kept for backward
+                compatibility. This parameter is deprecated. Default None.
+            return_numpy(bool): Whether to convert the fetched tensors to
+                numpy arrays. Default: True.

         Returns:
-            ParallelExecutor: The initialized ParallelExecutor object.
+            List: The fetched result list.

         Raises:
-            TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+            ValueError: If the feed is a list but its length does not equal
+                the number of active places, or one of its elements is not a
+                dict.
+
+        NOTES:
+            1. If the feed's type is dict, the number of samples fed to
+               ParallelExecutor must be greater than the number of active
+               places. Otherwise, it will throw an exception from the C++ side.
+               Special attention should be paid to checking whether the last
+               batch of the dataset is bigger than the number of active places.
+            2. If there is more than one active place, the fetch result for
+               each variable is a list, and each element of this list is the
+               result on the respective active place.

         Examples:
             .. code-block:: python
-            train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
-            test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                              main_program=test_program,
-                                              share_vars_from=train_exe)
-
-            train_loss, = train_exe.run([loss.name], feed=feed_dict)
-            test_loss, = test_exe.run([loss.name], feed=feed_dict)
+                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            loss_name=avg_cost.name,
+                                            main_program=fluid.default_main_program())
+                loss = pe.run(feed=feeder.feed(cur_batch),
+                              fetch_list=[avg_cost.name])
         """
-
-    def __init__(self,
-                 use_cuda,
-                 loss_name=None,
-                 main_program=None,
-                 share_vars_from=None,
-                 exec_strategy=None,
-                 build_strategy=None,
-                 num_trainers=1,
-                 trainer_id=0,
-                 scope=None):
-        self._places = []
-        self._act_places = []
-        if use_cuda:
-            for i in six.moves.range(core.get_cuda_device_count()):
-                p = core.Place()
-                self._act_places.append(core.CUDAPlace(i))
-                p.set_place(self._act_places[-1])
-                self._places.append(p)
-        else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            for i in six.moves.range(cpu_num):
-                p = core.Place()
-                self._act_places.append(core.CPUPlace())
-                p.set_place(self._act_places[-1])
-                self._places.append(p)
-        assert self._places, "no place for execution"
-
-        if exec_strategy is None:
-            exec_strategy = ExecutionStrategy()
-        exec_strategy.use_cuda = use_cuda
-
-        if exec_strategy.num_threads == 0:
-            if use_cuda:
-                # Experiments on se-resnext shows that too many threads hurt
-                # performance. Worth tunning for other models in the future.
-                exec_strategy.num_threads = len(self._places) * 4
-            else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num * 2
-
-        # Set 1 thread num under nccl2 distribute
-        # env to make sure all gpus run ops in same order.
-        if num_trainers > 1:
-            assert (use_cuda)
-            # FIXME(gongwb): avoid this set.
-            exec_strategy.num_threads = 1
-
-        if build_strategy is None:
-            build_strategy = BuildStrategy()
-
-        main = main_program
-        main = main if main else framework.default_main_program()
-        if scope == None:
-            scope = executor.global_scope()
-
-        if share_vars_from and not isinstance(share_vars_from,
-                                              ParallelExecutor):
-            raise TypeError("share_vars_from must be ParallelExecutor.")
-
-        local_scopes = share_vars_from.executor.local_scopes(
-        ) if share_vars_from else []
-
-        self.persistable_vars = [
-            v.name for v in [
-                var for var in main.list_vars()
-                if var.persistable and var.type != core.VarDesc.VarType.RAW
-            ]
-        ]
-
-        self.executor = core.ParallelExecutor(
-            self._places,
-            set([
-                cpt.to_text(p.name)
-                for p in main.global_block().iter_parameters()
-                if not p.stop_gradient
-            ]),
-            set(cpt.to_text(var)
-                for var in self.persistable_vars), main.desc,
-            cpt.to_text(loss_name)
-            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-            build_strategy, num_trainers, trainer_id)
-        self.scope = scope
-
-    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
-        """
-        Run a parallel executor with fetch_list.
-
-        The feed parameter can be a dict or a list. If feed is a dict, the
-        feed data will be split into multiple devices. If feed is a list, we
-        assume the data has been splitted into multiple devices, the each
-        element in the list will be copied to each device directly.
-
-        For example, if the feed is a dict:
-
-        >>> exe = ParallelExecutor()
-        >>> # the image will be splitted into devices. If there is two devices
-        >>> # each device will process an image with shape (24, 1, 28, 28)
-        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
-
-        For example, if the feed is a list:
-
-        >>> exe = ParallelExecutor()
-        >>> # each device will process each element in the list.
-        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
-        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
-        >>> #
-        >>> # you can use exe.device_count to get the device number.
-        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
-        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
-        >>>              ])
-
-        Args:
-            fetch_list(list): The fetched variable names
-            feed(list|dict|None): The feed variables. If the feed is a dict,
-                tensors in that dict will be splitted into each devices. If
-                the feed is a list, each element of the list will be copied
-                to each device. Default None.
-            feed_dict: Alias for feed parameter, for backward compatibility.
-                This parameter has been deprecated. Default None.
-            return_numpy(bool): Whether converts the fetched tensor to numpy.
-                Default: True.
-
-        Returns:
-            List: The fetched result list.
-
-        Raises:
-            ValueError: If the feed is a list, but its length is not equal the
-                length of active places, or its element's is not dict.
-
-        NOTES:
-            1. If the feed's type is dict, the number of data that feeds to
-               ParallelExecutor must be bigger than active places. Otherwise,
-               it will throw exception from C++ side. Special attention should be
-               paid to check whether the last batch of the dataset is bigger
-               than active places.
-            2. If active places are more than one, the fetch results for each
-               variable is a list, and each element of this list is the variable of
-               respective active place.
-
-        Examples:
-            .. code-block:: python
-
-                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                            loss_name=avg_cost.name,
-                                            main_program=fluid.default_main_program())
-                loss = pe.run(feed=feeder.feed(cur_batch),
-                              fetch_list=[avg_cost.name]))
-        """
-        if feed is None and feed_dict is not None:
-            feed = feed_dict
-            print(
-                "`feed_dict` is deprecated. Please use `feed=`",
-                file=sys.stderr)
-
-        if isinstance(feed, dict):
-            feed_tensor_dict = dict()
-            for feed_name in feed:
-                feed_tensor = feed[feed_name]
-                if not isinstance(feed_tensor, core.LoDTensor):
-                    feed_tensor = core.LoDTensor()
-                    # always set to CPU place, since the tensor need to be splitted
-                    # it is fast in CPU
-                    feed_tensor.set(feed[feed_name], core.CPUPlace())
-                feed_tensor_dict[feed_name] = feed_tensor
-
-            self.executor.feed_and_split_tensor_into_local_scopes(
-                feed_tensor_dict)
-        elif isinstance(feed, list) or isinstance(feed, tuple):
-            if len(feed) != len(self._act_places):
-                raise ValueError(
-                    "Feed a list of tensor, the list should be the same size as places"
-                )
-
-            res = list()
-
-            for i, each in enumerate(feed):
-                if not isinstance(each, dict):
-                    raise TypeError(
-                        "Each element of feed list should be a dict")
-                res_dict = dict()
-                for feed_name in each:
-                    tensor = each[feed_name]
-                    if not isinstance(tensor, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        tmp.set(tensor, self._act_places[i])
-                        tensor = tmp
-                    res_dict[feed_name] = tensor
-                res.append(res_dict)
-            self.executor.feed_tensors_into_local_scopes(res)
-
-        fetch_var_name = '@FETCHED_VAR_NAME@'
-        self.executor.run(fetch_list, fetch_var_name)
-        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
-
-        if return_numpy:
-            return executor.as_numpy(arr)
-
-        return [arr[i] for i in range(len(arr))]
-
-    @property
-    def device_count(self):
-        return len(self._act_places)
+        if feed is None and feed_dict is not None:
+            feed = feed_dict
+            print(
+                "`feed_dict` is deprecated. Please use `feed=`",
+                file=sys.stderr)
+
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # always set to CPU place, since the tensor needs to be
+                    # split, and splitting is fast on CPU
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            self.executor.feed_and_split_tensor_into_local_scopes(
+                feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._act_places):
+                raise ValueError(
+                    "Feed is a list of tensors; its length should equal the number of places"
+                )
+
+            res = list()
+
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of the feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._act_places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            self.executor.feed_tensors_into_local_scopes(res)
+
+        fetch_var_name = '@FETCHED_VAR_NAME@'
+        self.executor.run(fetch_list, fetch_var_name)
+        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if return_numpy:
+            return executor.as_numpy(arr)
+
+        return [arr[i] for i in range(len(arr))]
+
+    @property
+    def device_count(self):
+        return len(self._act_places)
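
A schematic illustration of the dict-vs-list feed contract implemented above, mirroring the docstring's own `>>>` examples (the `image` variable, the batch sizes, and the bare constructor call are illustrative, and assume a default program with such an input exists):

    import numpy
    import paddle.fluid as fluid

    pe = fluid.ParallelExecutor(use_cuda=False)
    # dict feed: one large batch, split across the pe.device_count places
    pe.run(fetch_list=[],
           feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
    # list feed: one pre-split dict per place, copied to its device as-is
    pe.run(fetch_list=[],
           feed=[{'image': numpy.random.random(size=(24, 1, 28, 28))}
                 for _ in range(pe.device_count)])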
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 1513eca51439288acac35729300bcbe4e71e4205..29e4ca04a7fbb2eae870fcf15763310b849c8b53 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -45,6 +45,10 @@ if(APPLE)
     list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
     list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
 endif()
+if(NOT WITH_MKLML)
+    # this op is not supported on OpenBLAS
+    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
+endif()
 
 function(py_test_modules TARGET_NAME)
     if(WITH_TESTING)
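
The op_test.py hunk below gates the CUDA place behind an opt-in `_cpu_only` attribute. A minimal sketch of how a test would use it (schematic, not part of the patch; assumes it runs from the unittests directory so `op_test` is importable):

    import numpy as np
    from op_test import OpTest

    class TestMyOpCPUOnly(OpTest):
        def setUp(self):
            self.op_type = "elementwise_mul"
            x = np.random.rand(2, 3).astype("float32")
            y = np.random.rand(2, 3).astype("float32")
            self.inputs = {'X': x, 'Y': y}
            self.outputs = {'Out': x * y}
            # With this flag set, the patched _get_places() returns only
            # CPUPlace(), even on a CUDA-enabled build.
            self._cpu_only = True

        def test_check_output(self):
            self.check_output()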
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 690c4cf0ad6b2c741689e419223cfa6b6e1e5cf3..c195a28e452fbe073a9afb5d650f538176f688fd 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -362,7 +362,9 @@ class OpTest(unittest.TestCase):
         else:
             return []
         places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
+        cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False
+        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
+           and not cpu_only:
             places.append(core.CUDAPlace(0))
         return places
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..536e9a1c58ec4a8b1b5a7c1d3a5fe737b38d24ab
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
@@ -0,0 +1,263 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from test_elementwise_mul_op import *
+
+
+class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp):
+    def init_input_output(self):
+        x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+        self.y = np.random.rand(1, 16).astype(self.dtype)
+
+        self.out = x * self.y.reshape(1, 16, 1, 1)
+        self.out = self.out.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_BroadcastNCHW16c, self).setUp()
+        self.attrs["x_data_format"] = "nchw16c"
+        self.attrs["y_data_format"] = "nc"
+        self._cpu_only = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
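+# NOTE (my reading, not stated in the patch): when C equals the channel block
+# size, MKL-DNN's blocked nChw16c layout has the same memory order as NHWC,
+# so the transpose(0, 2, 3, 1) + reshape pattern used by these tests builds a
+# buffer in nChw16c order while the tensor still reports its NCHW shape.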
+@unittest.skip(
+    "Not implemented yet.")  # TODO(mgallus): enable when implemented.
+class TestElementwiseMulMKLDNNOp_BroadcastNCHW8c(ElementwiseMulOp):
+    def init_input_output(self):
+        x = np.random.rand(1, 8, 2, 2).astype(self.dtype)
+        self.x = x.transpose(0, 2, 3, 1).reshape(1, 8, 2, 2)
+        self.y = np.random.rand(1, 8).astype(self.dtype)
+
+        self.out = x * self.y.reshape(1, 8, 1, 1)
+        self.out = self.out.transpose(0, 2, 3, 1).reshape(1, 8, 2, 2)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_BroadcastNCHW8c, self).setUp()
+        self.attrs["x_data_format"] = "nchw8c"
+        self.attrs["y_data_format"] = "nc"
+        self._cpu_only = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackNCHW(ElementwiseMulOp):
+    def init_input_output(self):
+        self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.y = np.random.rand(1, 16).astype(self.dtype)
+
+        self.out = self.x * self.y.reshape(1, 16, 1, 1)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp):
+    def init_input_output(self):
+        x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+        y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+
+        self.out = self.x * self.y
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackNCHW16C, self).setUp()
+        self.attrs["x_data_format"] = "nchw16c"
+        self.attrs["y_data_format"] = "nchw16c"
+        self._cpu_only = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp):
+    def init_input_output(self):
+        x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+        y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+
+        self.out = self.x * self.y
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackNoReorders, self).setUp()
+        self.attrs["x_data_format"] = "nchw16c"
+        self.attrs["y_data_format"] = "nchw16c"
+        self._cpu_only = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackWithReorder1(ElementwiseMulOp):
+    def init_input_output(self):
+        self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+
+        self.out = self.x * y
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackWithReorder1, self).setUp()
+        self.attrs["x_data_format"] = "nchw"
+        self.attrs["y_data_format"] = "nchw16c"
+        self._cpu_only = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackWithReorder2(ElementwiseMulOp):
+    def init_input_output(self):
+        self.y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+
+        self.out = x * self.y
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackWithReorder2, self).setUp()
+        self.attrs["x_data_format"] = "nchw16c"
+        self.attrs["y_data_format"] = "nchw"
+        self._cpu_only = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackNoReorders2(ElementwiseMulOp):
+    def init_input_output(self):
+        self.x = np.random.rand(1, 16).astype(self.dtype)
+        self.y = np.random.rand(1, 16).astype(self.dtype)
+
+        self.out = self.x * self.y
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackNoReorders2, self).setUp()
+        self.attrs["x_data_format"] = "nc"
+        self.attrs["y_data_format"] = "nc"
+        self._cpu_only = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
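
The refactor below turns ElementwiseMulOp.setUp() into a template method: subclasses override the init_* hooks instead of rewriting setUp(). A hypothetical subclass (not part of the patch) would look like:

    import numpy as np
    from test_elementwise_mul_op import ElementwiseMulOp  # unittests directory

    class TestMulBroadcastAxis1(ElementwiseMulOp):
        def init_input_output(self):
            self.x = np.random.rand(2, 3, 4).astype(self.dtype)
            self.y = np.random.rand(3).astype(self.dtype)
            self.out = self.x * self.y.reshape(1, 3, 1)

        def init_axis(self):
            self.axis = 1  # broadcast y over dimension 1 of x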
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 53409e436c0739bce63a3a8f90591e0ca6836859..57ba34f833f824d13e0b82caea789f7f57622bc9 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -21,13 +21,24 @@ from paddle.fluid.op import Operator
 
 
 class ElementwiseMulOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
     def setUp(self):
         self.op_type = "elementwise_mul"
+        self.dtype = np.float32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float64"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float64")
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
         }
-        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
+        self.outputs = {'Out': self.out}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
 
     def test_check_output(self):
         self.check_output()
@@ -41,6 +52,17 @@ class ElementwiseMulOp(OpTest):
     def test_check_grad_ingore_y(self):
         self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
 
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.multiply(self.x, self.y)
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
 
 class TestElementwiseMulOp_scalar(ElementwiseMulOp):
     def setUp(self):
@@ -63,17 +85,13 @@ class TestElementwiseMulOp_Vector(ElementwiseMulOp):
 
 
 class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float64),
-            'Y': np.random.rand(2).astype(np.float64)
-        }
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x * self.y.reshape(2, 1, 1)
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': self.inputs['X'] * self.inputs['Y'].reshape(2, 1, 1)
-        }
+    def init_axis(self):
+        self.axis = 0
 
 
 class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):