diff --git a/.gitignore b/.gitignore index 5018bf56c1633237b98d29a66eb86aed41fa6891..ce0cd3bc27b6225a8e6e24a8331022e6224603ac 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py +paddle/fluid/pybind/eager_final_state_op_function_impl.h +paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e8321010d389ee2493ef35d74d5d75d3ea73bfe9..a4c1b9c8098e9e632a4a05c491e07b1ce051c945 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -243,6 +243,7 @@ option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup ji option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) +option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) if(WITH_RECORD_BUILDTIME) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") @@ -265,6 +266,10 @@ if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thr return() endif() +if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER) +set(WITH_CUSTOM_DEVICE ON) +endif() + if(WIN32) if(WITH_DISTRIBUTE) MESSAGE(WARNING diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9ebde06bd01ab9968b9cc53a3e38a2b2e1684fc4..20a35c91bdde1d606cef2b46ad8aabb5952bd7d8 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -219,3 +219,7 @@ endif(ON_INFER) if(WITH_CRYPTO) add_definitions(-DPADDLE_WITH_CRYPTO) endif(WITH_CRYPTO) + +if(WITH_CUSTOM_DEVICE AND NOT WIN32) + add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) +endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index e9180c4fc9bb43cd2070e5bc93c74c7a9ee6510a..b099831738599ef4aaedd444d0a5d3721bd1aba8 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -55,6 +55,7 @@ IF(NOT WIN32) INSTALL_COMMAND make install NO_SHARED=1 NO_LAPACK=1 PREFIX= UPDATE_COMMAND "" CONFIGURE_COMMAND "" + BUILD_BYPRODUCTS ${CBLAS_LIBRARIES} ) ELSE(NOT WIN32) SET(CBLAS_LIBRARIES @@ -83,6 +84,8 @@ ELSE(NOT WIN32) CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + # ninja need to know where openblas.lib comes from + BUILD_BYPRODUCTS ${CBLAS_LIBRARIES} ) SET(OPENBLAS_SHARED_LIB ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX}) ENDIF(NOT WIN32) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 40b0a8b55e17a2eca26bb2c4d94221054724c530..941d470f87935f95abe5d599c9b7fa7a2730228b 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -53,7 +53,6 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, } else if (input_data.dtype == DistModelDataType::INT32) { input_tensor_ptr = input_tensor->mutable_data(dims, place); } else { - // Q(fleet exe dev): for input/output, should we support fp16 LOG(ERROR) << "unsupported feed type " << input_data.dtype; return false; } @@ -113,14 +112,6 @@ std::string DistModelDTypeToString(DistModelDataType dtype) { return "NOT SUPPORT DTYPE"; } -bool IsPPFirstStage(const DistModelConfig &config) { - return config.local_rank - config.mp_degree 
< 0; -} - -bool IsPPLastStage(const DistModelConfig &config) { - return config.local_rank + config.mp_degree >= config.nranks; -} - class DistModelTimer { public: void tic() { tic_time = std::chrono::high_resolution_clock::now(); } @@ -197,65 +188,34 @@ bool DistModel::PreparePlace() { } bool DistModel::CommInit() { - // NOTE (Yuang Liu): The peer endpoints will be obtained with the assumption - // that mp part is always on inner side and pp part is always on outer side. - // TODO(fleet exe dev): The peer endpoints could be configured by users. - PADDLE_ENFORCE_EQ( - config_.pp_degree * config_.mp_degree, config_.nranks, - platform::errors::InvalidArgument( - "The mp_degree multiplies pp_degree is not equal with nranks")); std::unique_ptr comm_init_program( new framework::ProgramDesc()); framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); - if (config_.mp_degree > 1) { - PADDLE_ENFORCE_GE( - config_.mp_ring_id, 0, - platform::errors::InvalidArgument( - "mp ring id must be provided for inference under mp.")); - VLOG(3) << "Init comm group for mp."; + std::vector &ring_ids = + config_.rank_to_ring_ids_[config_.local_rank]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = config_.ring_id_to_ranks_[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = config_.ring_id_to_ranks_[ring_id]; + for (int64_t rank : ranks) { + if (config_.local_rank == rank) { + break; + } + rank_in_group += 1; + } std::vector peer_endpoints; - for (int64_t - idx = (config_.local_rank / config_.mp_degree) * config_.mp_degree, - i = 0; - i < config_.mp_degree; ++idx, ++i) { - if (config_.trainer_endpoints[idx] == config_.current_endpoint) { + for (int64_t rank : ranks) { + if (config_.local_rank == rank) { continue; } - peer_endpoints.emplace_back(config_.trainer_endpoints[idx]); - } - // get nranks in a mp group and inner group rank for local rank - int64_t mp_group_nranks = config_.nranks / config_.pp_degree; - int64_t mp_group_rank = config_.local_rank % config_.mp_degree; - InsertCommOp("mp_comm_id", mp_group_nranks, mp_group_rank, peer_endpoints, - comm_init_block, config_.mp_ring_id); - } - if (config_.pp_degree > 1) { - VLOG(3) << "Init comm group for pp."; - if (!IsPPFirstStage(config_)) { - PADDLE_ENFORCE_EQ(config_.pp_upstream_ring_id >= 0, true, - platform::errors::InvalidArgument( - "pp upstream ring id must be provided for " - "non-first pp stage if inference under pp.")); - // not the first pp stage, has upstream - std::vector upstream_peer_endpoints; - upstream_peer_endpoints.emplace_back( - config_.trainer_endpoints[config_.local_rank - config_.mp_degree]); - InsertCommOp("pp_upstream_comm_id", 2, 1, upstream_peer_endpoints, - comm_init_block, config_.pp_upstream_ring_id); - } - - if (!IsPPLastStage(config_)) { - PADDLE_ENFORCE_EQ(config_.pp_downstream_ring_id >= 0, true, - platform::errors::InvalidArgument( - "pp downstream ring id must be provided for " - "non-last pp stage if inference under pp.")); - // not the last pp stage, has downstream - std::vector downstream_peer_endpoints; - downstream_peer_endpoints.emplace_back( - config_.trainer_endpoints[config_.local_rank + config_.mp_degree]); - InsertCommOp("pp_downstream_comm_id", 2, 0, downstream_peer_endpoints, - comm_init_block, config_.pp_downstream_ring_id); + peer_endpoints.emplace_back(config_.trainer_endpoints[rank]); } + InsertCommOp(var_name_base + std::to_string(order), 
ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; } framework::NaiveExecutor e(place_); e.CreateVariables(*comm_init_program, 0, true, scope_.get()); @@ -409,12 +369,7 @@ bool DistModel::LoadParameters() { bool DistModel::PrepareFleetExe() { task_node_.reset(new TaskNode(program_.get(), config_.local_rank)); - if (config_.local_rank - config_.mp_degree >= 0) { - task_node_->AddUpstreamTask(config_.local_rank - config_.mp_degree); - } - if (config_.local_rank + config_.mp_degree < config_.nranks) { - task_node_->AddDownstreamTask(config_.local_rank + config_.mp_degree); - } + // With auto cut, there is no concept of pp, no need to add dependency. task_node_->SetType("Compute"); task_node_->Init(); executor_desc_ = FleetExecutorDesc(); @@ -473,40 +428,13 @@ bool DistModel::PrepareFeedAndFetch() { } } - if (config_.pp_degree == 1) { - if (feeds_.size() == 0) { - LOG(ERROR) << "No feed ops in the inf program, please check the program."; - return false; - } - if (fetches_.size() == 0) { - LOG(ERROR) << "No fetch op in the inf program, please check the program."; - return false; - } - } else { - if (IsPPFirstStage(config_)) { - if (feeds_.size() == 0) { - LOG(ERROR) << "Feed ops are needed for the first pp stage."; - return false; - } - } else { - if (feeds_.size() > 0) { - LOG(WARNING) << "Feed op is found in the non-first stage of pp."; - } else { - LOG(INFO) << "No feed ops in non-first pp stage."; - } - } - if (IsPPLastStage(config_)) { - if (fetches_.size() == 0) { - LOG(WARNING) << "No fetch op was found in the last pp stage. Make sure " - "the result has been sent to frist pp stage."; - } - } else { - if (fetches_.size() > 0) { - LOG(WARNING) << "Fetch op is found in the non-last stage of pp."; - } else { - LOG(INFO) << "No fetch op in non-last pp stage."; - } - } + if (feeds_.size() == 0) { + LOG(ERROR) << "No feed ops in the inf program, please check the program."; + return false; + } + if (fetches_.size() == 0) { + LOG(ERROR) << "No fetch op in the inf program, please check the program."; + return false; } return true; } @@ -606,7 +534,6 @@ bool DistModel::FetchResult(const framework::LoDTensor &fetch, bool DistModel::Run(const std::vector &input_data, std::vector *output_data) { - // TODO(fleet exe dev): support pipeline inf mode VLOG(3) << "DistModel run for once."; DistModelTimer timer; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h index c980178b67c5244e751a8e89b945f353110a7456..d0203c131357c749b7df20a345982d2ddd025783 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include @@ -47,12 +48,9 @@ struct DistModelConfig { std::string current_endpoint{}; int64_t nranks{1}; int64_t local_rank{0}; - int64_t mp_degree{1}; - int64_t pp_degree{1}; - int64_t mp_ring_id{-1}; - int64_t pp_upstream_ring_id{-1}; - int64_t pp_downstream_ring_id{-1}; bool enable_timer{false}; + std::map> ring_id_to_ranks_{}; + std::map> rank_to_ring_ids_{}; }; class DistModel { diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 07d2a0f6b727aa56ef804e5ca9dee8e7a86e2cdb..643ef52e87bdaff0d531a68922077a8877830a9f 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(fill_constant); namespace paddle { diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index ab3b33d411c0e09f37885491e93144a2577d5c40..5dc8709679e25a48f2aa047b0404092ac8c1dc66 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1227,11 +1227,11 @@ static std::pair GenerateForwardFunctionContents( // Forward Function Body // According to fwd_inputs_name_pos_map - std::map>> + std::map>> ins = { {"X" , TrySyncToVars(X)}, { "Y" , TrySyncToVars(Y)} }; - std::map>> + std::map>> outs = { {"Out0" , CreateVars(Out0Num)}, {"Out1" @@ -1316,7 +1316,7 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_INS_MAP_TEMPLATE = " std::map>> ins = { " + "std::vector>> ins = { " "%s };\n"; std::string ins_map_str = paddle::string::Sprintf(FWD_INS_MAP_TEMPLATE, ins_contents_str); @@ -1353,8 +1353,9 @@ static std::pair GenerateForwardFunctionContents( if (op_passing_outs_map[op_type].count(output_name)) { const std::string output_var_name = output_name + "Var"; - // Pass Output from function argument(EagerTensor*/vector&), - // in form of shared_ptr/vector> + // Pass Output from function + // argument(EagerVariable*/vector&), + // in form of shared_ptr/vector> if (output.duplicable()) { const char* FWD_NUM_ARG_TEMPLATE = ", std::vector& %s"; @@ -1395,7 +1396,7 @@ static std::pair GenerateForwardFunctionContents( } else { const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance()." + "{std::make_shared(egr::Controller::Instance()." 
"GenerateUniqueName())}},"; outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name); @@ -1407,7 +1408,7 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_OUTS_MAP_TEMPLATE = " std::map>> outs = { " + "std::vector>> outs = { " "%s };\n"; std::string outs_map_str = paddle::string::Sprintf(FWD_OUTS_MAP_TEMPLATE, outs_contents_str); @@ -1482,7 +1483,7 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += out_tensor_str; } generated_function_body += "\n"; - VLOG(6) << "Converted Output VarBase to EagerTensor(s)"; + VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; // [Generation] Handle core_ops_returns_info core_ops_returns_info[op_type] = return_contents; @@ -1627,7 +1628,7 @@ static std::string GenerateSingleOpBase( const char* BWD_INS_MAP_TEMPLATE = " std::map>> %s = { " + "std::vector>> %s = { " "%s };\n"; std::string ins_map_str = paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); @@ -1704,7 +1705,7 @@ static std::string GenerateSingleOpBase( } else { const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance(" + "{std::make_shared(egr::Controller::Instance(" ")." "GenerateUniqueName())}},"; outs_contents_str += paddle::string::Sprintf( @@ -1723,7 +1724,7 @@ static std::string GenerateSingleOpBase( const char* BWD_OUTS_MAP_TEMPLATE = " std::map>> %s = { " + "std::vector>> %s = { " "%s };\n"; std::string outs_map_str = paddle::string::Sprintf( BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 2326ab012e3caef34b6b70950dcc1088111ab9e5..19ce457df60cba5e1a1a044f0c7f43a7cbda06d9 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -40,36 +40,28 @@ * **/ namespace egr { -class EagerTensor final { +class EagerVariable final { public: /* Default constructor and name constructor should only be used for contruct * output and in fluid*/ - EagerTensor() = default; + EagerVariable() = default; - explicit EagerTensor(const std::string& name) : name_(name) {} + explicit EagerVariable(const std::string& name) : name_(name) {} - explicit EagerTensor(const paddle::experimental::Tensor& tensor) + explicit EagerVariable(const paddle::experimental::Tensor& tensor) : name_(tensor.name()) { if (tensor.defined()) { if (tensor.is_dense_tensor()) { - auto* framework_tensor = - var_.GetMutable(); - // Contruct framework::Tensor from egr::EagerTensor - auto tensor_dense = - std::dynamic_pointer_cast(tensor.impl()); - PADDLE_ENFORCE_EQ((tensor_dense.get() && tensor_dense), true, - paddle::platform::errors::Fatal( - "Failed to Trans Tensor to EagerVariable since " - "we got Tensor with type DenseTensor, and we got " - "EagerVariable with another type.")); - *framework_tensor = *tensor_dense; + ConstructVariableFromTensor(tensor); + } else if (tensor.is_selected_rows()) { + ConstructVariableFromSelectedRows(tensor); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unrecognized egr::EagerVariable type, only " - "DenseTensor and SelectedRows is supported for now.")); + "DenseTensor and SelectedRows are supported for now.")); } } else { - VLOG(6) << "Build Empty EagerTensor with name " << name_; + VLOG(6) << "Build Empty EagerVariable with name " << name_; } } @@ -77,21 +69,20 @@ class EagerTensor final { std::shared_ptr GetTensorBase() { // Construct allocation only once. 
if (var_.IsInitialized()) { - if (var_.IsType()) { - return SetImplWithLegacyTensor(); - } else if (var_.IsType()) { - return SetImplWithLegacyTensor(); + if (var_.IsType() || + var_.IsType()) { + return SetImplWithLegacyTensor(); } else if (var_.IsType()) { - return SetImplWithSelectedRows(); + return SetImplWithLegacySelectedRows(); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unable to fetch underlying tensor " - "from EagerTensor, only LoDTensor and " + "from EagerVariable, only LoDTensor and " "Tensor are supported for now")); } } else { PADDLE_THROW(paddle::platform::errors::Fatal( - "Can not Sync EagerTensor %s whose paddle::framework::Variable is " + "Can not Sync EagerVariable %s whose paddle::framework::Variable is " "not initialized!", name())); } @@ -107,23 +98,52 @@ class EagerTensor final { void set_name(const std::string& name) { name_ = name; } private: - template std::shared_ptr SetImplWithLegacyTensor() { - const auto& framework_tensor = var_.Get(); + const auto& framework_tensor = var_.Get(); VLOG(8) << "Sync Var to tensor for: " << name(); - return std::make_shared(std::move(framework_tensor)); + return std::make_shared(framework_tensor); } - std::shared_ptr SetImplWithSelectedRows() { - auto* selected_rows = var_.GetMutable(); - auto res = std::make_shared(selected_rows->rows_, - selected_rows->height_); - res->value_.reset(selected_rows->value_.release()); - res->id_to_index_ = std::move(selected_rows->id_to_index_); - res->rwlock_.reset(selected_rows->rwlock_.release()); + std::shared_ptr SetImplWithLegacySelectedRows() { + auto* framework_tensor = var_.GetMutable(); + VLOG(8) << "Sync SelectedRows to tensor for: " << name(); + auto res = + std::make_shared(std::move(*framework_tensor)); + var_.Clear(); return res; } + void ConstructVariableFromTensor(const paddle::experimental::Tensor& tensor) { + auto* framework_tensor = var_.GetMutable(); + // Contruct framework::Tensor from egr::EagerVariable + auto tensor_dense = + std::dynamic_pointer_cast(tensor.impl()); + PADDLE_ENFORCE_EQ( + (tensor_dense.get() && tensor_dense), true, + paddle::platform::errors::Fatal( + "Tensor %s does not hold pten::SelectedRows or pten::DenseTensor. " + "Or it holds empty impl, this should not happend since we should " + "treat all kinds of tensor as what they are.", + tensor.name())); + *framework_tensor = *tensor_dense; + } + + void ConstructVariableFromSelectedRows( + const paddle::experimental::Tensor& tensor) { + auto* framework_tensor = var_.GetMutable(); + // Contruct framework::Tensor from egr::EagerVariable + auto tensor_dense = + std::dynamic_pointer_cast(tensor.impl()); + PADDLE_ENFORCE_EQ( + (tensor_dense.get() && tensor_dense), true, + paddle::platform::errors::Fatal( + "Tensor %s does not hold pten::SelectedRows or pten::DenseTensor. 
" + "Or it holds empty impl, this should not happend since we should " + "treat all kinds of tensor as what they are.", + tensor.name())); + *framework_tensor = std::move(*tensor_dense); + } + private: std::string name_{""}; paddle::framework::Variable var_; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 56fdf542bbb93ec28c0dc21bacf38eedb3968bd0..6a8720c1cc27de41a91b40c29ae9d08b99ccb09e 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -78,9 +78,9 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, if (buffer_tensor.is_dense_tensor()) { paddle::imperative::SelectedRowsAddToTensor(t, &buffer_tensor); } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "We don't support Selected Rows merge for now, support it later " - "and make all kinds of grads can be merged.")); + buffer_tensor = + std::move(*paddle::imperative::SelectedRowsMerge< + paddle::experimental::Tensor>(t, buffer_tensor)); } } } diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index c27d1871e398164ad976c73919499ceed3938057..e3bb53106776604d1c2fee0a53fc6d87a9d83755 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -115,7 +115,7 @@ TEST(Tensor, MemberFunction) { CHECK_EQ(tmp_autograd_meta_test->val_, 2); } -TEST(EagerTensor, Constructor) { +TEST(EagerVariable, Constructor) { paddle::experimental::Tensor t3; pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); @@ -134,7 +134,7 @@ TEST(EagerTensor, Constructor) { CHECK_EQ(t3.defined(), false); t3.set_impl(dt); - egr::EagerTensor et3 = egr::EagerTensor(t3); + egr::EagerVariable et3 = egr::EagerVariable(t3); VLOG(6) << "SyncToVar"; CHECK_EQ(et3.Var().Get().data()[0], 5.0f); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index b771ff28d8ee2d762f5bca717942d4a57c155984..734a611d07b57b6e8e31933cf2683e60efff487a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/selected_rows.h" #include "paddle/pten/core/kernel_registry.h" @@ -102,3 +103,69 @@ TEST(GradTensorHolder, Interfaces) { CHECK_EQ(holder_et0_ptr[0], 1.0f); CHECK_EQ(holder_et1_ptr[0], 30.0f); } + +TEST(GradTensorHolder, SelectedRowsMergeAdd) { + pten::CPUPlace cpu; + + std::vector rows{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + int64_t table_size = 10; + int64_t embedding_width = 10; + + auto sr1 = std::make_shared(rows, table_size); + auto sr2 = std::make_shared(rows, table_size); + + // initialize a sparse table 1 + sr1->mutable_value()->Resize( + pten::framework::make_ddim({table_size, embedding_width})); + auto* data_sr1 = sr1->mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data_sr1[i * embedding_width + j] = static_cast(i); + } + } + + // initialize a sparse table 2 + sr2->mutable_value()->Resize( + pten::framework::make_ddim({table_size, 
embedding_width})); + auto* data_sr2 = sr2->mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data_sr2[i * embedding_width + j] = static_cast(i); + } + } + // new 2 pten::Tensor + paddle::experimental::Tensor t1(sr1); + paddle::experimental::Tensor t2(sr2); + + // Constructor empty GradTensorHolder + GradSlotMeta slot_meta; + slot_meta.Init(1); + GradTensorHolder grad_tensor_holder = + GradTensorHolder({slot_meta, slot_meta}); + + // accumulation + grad_tensor_holder.add(0, 0, t1, false); + grad_tensor_holder.add(0, 0, t2, false); + + // Buffers() + const auto& buffers = grad_tensor_holder.Buffers(); + CHECK_EQ(static_cast(buffers.size()), 2); + CHECK_EQ(static_cast(buffers[0].size()), 1); + CHECK_EQ(static_cast(buffers[1].size()), 1); + + // operator[] + const auto& holder_et0 = grad_tensor_holder[0][0]; + + auto* tmp_buffer_tensor = + static_cast(holder_et0.impl().get()); + auto* tmp_buffer_data_sr = + tmp_buffer_tensor->mutable_value()->mutable_data(cpu); + + // verify the MergeAdd result (accumulation result) + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + EXPECT_EQ(tmp_buffer_data_sr[i * embedding_width + j], + (static_cast(i) + static_cast(i))); + } + } +} diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 176a02d896384f90226eb196436a9a41670852a7..8aa6b7b8460749911a9f7187564aa1195006b537 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -176,6 +176,6 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { } USE_OP_ITSELF(scale); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index d2bc05f41b532238c688960087dba6ce1281331f..53d97b2919a5bf6b1a7b0c99b3ed46b5f70b27ef 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -189,6 +189,6 @@ USE_OP_ITSELF(scale); USE_OP(matmul_v2); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index c2f0479460064e05fc917ec432a7384e43e73cf3..0b2585905d3eda09b2565812f918949ed7f2ffba 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -212,6 +212,6 @@ TEST(Benchmark, FluidMLPCPU) { } // namespace paddle USE_OP_ITSELF(scale); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 250005e31150c3c9d83d3d094ccb4e00b2de7429..9cebb73a34a7ff6541a499bdd4f36997034f4bf1 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -249,6 +249,6 @@ USE_OP_ITSELF(scale); USE_OP(matmul_v2); USE_OP(reduce_sum); 
USE_OP(reduce_sum_grad); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index c11bd94ee9369f983684be38fbb811d87968791a..db3d2cf519c6ddc892e0502dfcee6914d3e594a8 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -16,6 +16,7 @@ #include "gtest/gtest.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" @@ -167,7 +168,7 @@ TEST(EagerUtils, PassStopGradient) { TEST(EagerUtils, TrySyncToVar) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); auto tensor = CreateTestCPUTensor(5.0f, ddim); - std::vector<std::shared_ptr<egr::EagerTensor>> var_bases = { + std::vector<std::shared_ptr<egr::EagerVariable>> var_bases = { egr::EagerUtils::TrySyncToVar(tensor)}; paddle::framework::Variable* var = var_bases[0]->MutableVar(); @@ -187,7 +188,7 @@ TEST(EagerUtils, TrySyncToVars) { std::vector<paddle::experimental::Tensor> tensors = { CreateTestCPUTensor(1.0f, ddim), CreateTestCPUTensor(2.0f, ddim)}; - std::vector<std::shared_ptr<egr::EagerTensor>> var_bases = + std::vector<std::shared_ptr<egr::EagerVariable>> var_bases = egr::EagerUtils::TrySyncToVars(tensors); { @@ -218,10 +219,32 @@ TEST(EagerUtils, TrySyncToVars) { TEST(EagerUtils, CreateVars) { VLOG(6) << "Check CreateVars"; - std::vector<std::shared_ptr<egr::EagerTensor>> outs = + std::vector<std::shared_ptr<egr::EagerVariable>> outs = egr::EagerUtils::CreateVars(2); CHECK_EQ(outs.size(), size_t(2)); CHECK(outs[0]->Var().IsInitialized() == false); } +TEST(EagerUtils, GetGradAccumulationNode) { + VLOG(6) << "Check GetGradAccumulationNode"; + paddle::experimental::Tensor t0("test_tensor"); + ASSERT_EQ(egr::EagerUtils::GetGradAccumulationNode(t0), nullptr); + auto autograd_ptr0 = egr::EagerUtils::autograd_meta(&t0); + autograd_ptr0->SetStopGradient(true); + ASSERT_EQ(egr::EagerUtils::GetGradAccumulationNode(t0), nullptr); + autograd_ptr0->SetStopGradient(false); + auto res = std::dynamic_pointer_cast<egr::GradNodeAccumulation>( + egr::EagerUtils::GetGradAccumulationNode(t0)); + ASSERT_TRUE(res != nullptr); + auto res2 = egr::EagerUtils::GetGradAccumulationNode(t0); + ASSERT_EQ(res2.get(), res.get()); + autograd_ptr0->SetStopGradient(true); + auto res3 = egr::EagerUtils::GetGradAccumulationNode(t0); + ASSERT_EQ(res3, nullptr); + autograd_ptr0->SetStopGradient(false); + autograd_ptr0->SetGradNode( + std::make_shared(1, 2.0, 3)); + ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0)); +} + } // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 5b95b43edea82b8beac9c46fe81651784f608274..e3bdba05e97365fb177e6130d5ceaab9f7838529 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -123,5 +123,5 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr USE_OP(sigmoid); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(matmul_v2); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 7be70ff957565b2246e0e0fd8636816633f7e5c8..a8c27e86b877ae7483e3c52c87d19308b9a48907 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -21,6 +21,7 @@ #include "paddle/pten/common/layout.h" #include "paddle/pten/core/tensor_meta.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/variable.h" @@ -131,17 +132,17 @@ void EagerUtils::SetOutRankWithSlot(AutogradMeta* target, size_t slot_id) { target->SetSingleOutRankWithSlot(slot_id, 0); } -std::shared_ptr EagerUtils::TrySyncToVar( +std::shared_ptr EagerUtils::TrySyncToVar( const paddle::experimental::Tensor& tensor) { - return std::make_shared(tensor); + return std::make_shared(tensor); } -std::vector> EagerUtils::TrySyncToVars( +std::vector> EagerUtils::TrySyncToVars( const paddle::experimental::Tensor& tensor) { return {TrySyncToVar(tensor)}; } -std::vector> EagerUtils::TrySyncToVars( +std::vector> EagerUtils::TrySyncToVars( paddle::experimental::Tensor* tensor) { PADDLE_ENFORCE_NOT_NULL( tensor, @@ -151,9 +152,9 @@ std::vector> EagerUtils::TrySyncToVars( return {TrySyncToVar(*tensor)}; } -std::vector> EagerUtils::TrySyncToVars( +std::vector> EagerUtils::TrySyncToVars( const std::vector& tensors) { - std::vector> res; + std::vector> res; size_t num = tensors.size(); res.reserve(num); for (size_t i = 0; i < num; i++) { @@ -169,9 +170,9 @@ std::vector> EagerUtils::TrySyncToVars( return res; } -std::vector> EagerUtils::TrySyncToVars( +std::vector> EagerUtils::TrySyncToVars( const std::vector& tensors) { - std::vector> res; + std::vector> res; size_t num = tensors.size(); res.reserve(num); for (size_t i = 0; i < num; i++) { @@ -180,19 +181,19 @@ std::vector> EagerUtils::TrySyncToVars( return res; } -std::vector> EagerUtils::CreateVars( +std::vector> EagerUtils::CreateVars( const size_t num) { - std::vector> res; + std::vector> res; res.reserve(num); for (size_t i = 0; i < num; i++) { res.emplace_back( - new EagerTensor(egr::Controller::Instance().GenerateUniqueName())); + new EagerVariable(egr::Controller::Instance().GenerateUniqueName())); } return res; } std::vector EagerUtils::GetOutputs( - const std::vector>& outs) { + const std::vector>& outs) { std::vector res; res.reserve(outs.size()); for (const auto& out : outs) { @@ -209,7 +210,7 @@ std::vector EagerUtils::GetOutputs( } paddle::experimental::Tensor EagerUtils::GetOutput( - const std::shared_ptr& out) { + const std::shared_ptr& out) { PADDLE_ENFORCE_NOT_NULL( out.get(), paddle::platform::errors::Fatal( "Eager Tensor %s is null and cannot be copied. 
We " @@ -219,7 +220,7 @@ paddle::experimental::Tensor EagerUtils::GetOutput( return paddle::experimental::Tensor(out->GetTensorBase(), out->name()); } -void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, +void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, paddle::experimental::Tensor* tensor) { PADDLE_ENFORCE_NOT_NULL( tensor, paddle::platform::errors::Fatal( @@ -231,7 +232,7 @@ void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, } void EagerUtils::OverwriteOutputs( - const std::vector>& outs, + const std::vector>& outs, const std::vector& tensors) { PADDLE_ENFORCE_EQ( outs.size(), tensors.size(), @@ -303,4 +304,41 @@ void EagerUtils::CheckAndRetainGrad( } } +std::shared_ptr EagerUtils::GetGradAccumulationNode( + const paddle::experimental::Tensor& tensor) { + auto* autograd_ptr = nullable_autograd_meta(tensor); + if (!autograd_ptr) { + return nullptr; + } + auto node_ptr = autograd_ptr->GetMutableGradNode(); + if (node_ptr && node_ptr.get()) { + if (!autograd_ptr->StopGradient()) { + auto accumulation_ptr = + std::dynamic_pointer_cast(node_ptr); + if (accumulation_ptr) { + return accumulation_ptr; + } else { + // Current GradNode is not a egr::GradNodeAccumulation + PADDLE_THROW(paddle::platform::errors::Fatal( + "GetGradAccumulationNode should only be called on leaf tensor, but " + "target tensor: %s has GradNode which is not a " + "GradNodeAccumulation, and this should not happend unless target " + "tensor is modified by some ops and calling set history for it.", + tensor.name())); + } + } else { + // Current Tensor does not have grad since it's stop_gradient is true; + return nullptr; + } + } else { + if (!autograd_ptr->StopGradient()) { + VLOG(6) << "Add GradNodeAccumulation for tensor: " << tensor.name(); + autograd_ptr->SetGradNode(std::make_shared()); + return autograd_ptr->GetMutableGradNode(); + } else { + return nullptr; + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index b0549488efc8f2e85d5550251bfffc9dac3a1af7..11c728e4c6c9bdd3e3ee60fb474200ff5ae20afc 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -88,7 +88,7 @@ class EagerUtils { /** * We have to use autograd_meta and multi_autograd_meta to initialize * autograd_meta for tensor, since we can't init it in - * egr::EagerTensor's + * egr::EagerVariable's * constructor (it's abstract class there) * * **/ @@ -151,34 +151,35 @@ class EagerUtils { // Intermidate needed remove this once we don't need legacy // Inner Method - static std::shared_ptr TrySyncToVar( + static std::shared_ptr TrySyncToVar( const paddle::experimental::Tensor& tensor); // Basic Input - static std::vector> TrySyncToVars( + static std::vector> TrySyncToVars( const paddle::experimental::Tensor& tensor); // Basic Output - static std::vector> TrySyncToVars( + static std::vector> TrySyncToVars( paddle::experimental::Tensor* tensor); // Multi Output - static std::vector> TrySyncToVars( + static std::vector> TrySyncToVars( const std::vector& tensors); // Multi Input - static std::vector> TrySyncToVars( + static std::vector> TrySyncToVars( const std::vector& tensors); // Construct empty output - static std::vector> CreateVars(const size_t num); + static std::vector> CreateVars( + const size_t num); // Construct Tensor From var static std::vector GetOutputs( - const std::vector>& outs); + const std::vector>& outs); static paddle::experimental::Tensor GetOutput( - const std::shared_ptr& out); + const std::shared_ptr& out); // Sync Back to origin output 
Tensor - static void OverwriteOutputs(const std::shared_ptr<EagerTensor>& out, + static void OverwriteOutputs(const std::shared_ptr<EagerVariable>& out, paddle::experimental::Tensor* tensor); static void OverwriteOutputs(const paddle::experimental::Tensor& out, paddle::experimental::Tensor* tensor); static void OverwriteOutputs( - const std::vector<std::shared_ptr<EagerTensor>>& outs, + const std::vector<std::shared_ptr<EagerVariable>>& outs, const std::vector& tensors); static void OverwriteOutputs( const std::vector& outs, @@ -188,6 +189,8 @@ class EagerUtils { static void CheckAndRetainGrad(const paddle::experimental::Tensor& tensor); static void CheckAndRetainGrad( const std::vector<paddle::experimental::Tensor>& tensors); + static std::shared_ptr<GradNodeBase> GetGradAccumulationNode( + const paddle::experimental::Tensor& tensor); }; } // namespace egr diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a3f0ed392646c370e731f2d2f573f3dde348a5c9..78f5bb077aaf189ff0d21aba853d62aebe46f53e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -413,7 +413,7 @@ cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tens cc_library(generator SRCS generator.cc DEPS enforce place) cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place pten var_type_traits pten pten_api_utils op_info shape_inference) - +cc_test(infershape_utils_test SRCS infershape_utils_test.cc DEPS infershape_utils infermeta_utils meta_tensor) # Get the current working branch execute_process( @@ -458,4 +458,5 @@ if(WITH_GPU OR WITH_ROCM) else() cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place) endif() +cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils) cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor) diff --git a/paddle/pten/tests/core/test_convert_utils.cc b/paddle/fluid/framework/convert_utils_test.cc similarity index 100% rename from paddle/pten/tests/core/test_convert_utils.cc rename to paddle/fluid/framework/convert_utils_test.cc index 977e49aafb9bd4e84e6626e1f3bbe16a30ef4c52..d547070e6d1f092f5a65ccfef6d743de6e6331e2 100644 --- a/paddle/pten/tests/core/test_convert_utils.cc +++ b/paddle/fluid/framework/convert_utils_test.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include "gtest/gtest.h" #include "paddle/fluid/framework/convert_utils.h" +#include "gtest/gtest.h" namespace pten { namespace tests { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 24f1591ff33c965b9b787c05ff5db67ad4362ea4..20d08ef18aeb3e4d8a9f5cfd0b38954daf27020d 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -100,6 +100,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { platform::errors::Unimplemented("platform::MLUPlace is not supported")); } + inline ::DLDevice operator()(const platform::CustomPlace &place) const { + PADDLE_THROW(platform::errors::Unimplemented( + "platform::CustomPlace is not supported")); + } + inline ::DLDevice operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLDevice device; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5596aba52131b74785741e16f9dc6ef71e6a91cb..4e6a4d5360860e8971c6dc9c2842defabcffd0dd 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -494,6 +494,20 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle")); +#endif + } else if (platform::is_custom_place(place_)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for " << place_ << "."; + gc.reset(new CustomDeviceUnsafeFastGarbageCollector(place_, + max_memory_size)); + } else { + VLOG(4) << "Use default stream gc for " << place_ << "."; + gc.reset( + new CustomDefaultStreamGarbageCollector(place_, max_memory_size)); + } +#else + PADDLE_THROW(platform::errors::Unimplemented("No CustomDevice gc found")); #endif } } diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 22f77be85055578f0d4e8288e90001fb59e9628d..9f2bdeffecf62764f5cbe5bea9cb50d4830be43b 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -18,6 +18,7 @@ #endif #include "gflags/gflags.h" #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/platform/device/device_wrapper.h" DECLARE_double(eager_delete_tensor_gb); DECLARE_double(memory_fraction_of_eager_deletion); @@ -202,6 +203,58 @@ void MLUStreamGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CustomDefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void CustomDefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +CustomDeviceUnsafeFastGarbageCollector::CustomDeviceUnsafeFastGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CustomDeviceUnsafeFastGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +CustomStreamGarbageCollector::CustomStreamGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::DeviceGuard guard(place); + stream_.reset(new 
platform::stream::Stream); + stream_->Init(place); + callback_manager_.reset(new platform::CallbackManager(stream_.get())); +} + +CustomStreamGarbageCollector::~CustomStreamGarbageCollector() { + platform::DeviceGuard guard(this->dev_ctx_->GetPlace()); + stream_->Synchronize(); + stream_->Destroy(); +} + +platform::stream::Stream *CustomStreamGarbageCollector::stream() const { + return stream_.get(); +} + +void CustomStreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void CustomStreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? -1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f5d79d864b5659ed2b16cdded7e471eca457e3c5..a67860c6087e0f173e09d2a7c131703260c562fd 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -200,6 +200,47 @@ class MLUStreamGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomDefaultStreamGarbageCollector : public GarbageCollector { + public: + CustomDefaultStreamGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class CustomDeviceUnsafeFastGarbageCollector : public GarbageCollector { + public: + CustomDeviceUnsafeFastGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class CustomStreamGarbageCollector : public GarbageCollector { + public: + CustomStreamGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + ~CustomStreamGarbageCollector(); + + void Wait() const override; + + platform::stream::Stream *stream() const; + + protected: + void ClearCallback(const std::function &callback) override; + + private: + std::unique_ptr stream_; + std::unique_ptr callback_manager_; +}; +#endif + template void GarbageCollector::Add(Container &&objs) { Add(std::forward(objs), []() {}); diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 9e1958973d2d97a351ef5ced57339fb698b70281..bc0344d405cf795bc96fd3fb2d5376bbde89bd2b 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" +#include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/pten_utils.h" @@ -303,13 +305,45 @@ pten::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, auto& attr = attr_reader.GetAttr(attr_name); if (std::type_index(attr.type()) == std::type_index(typeid(bool))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + } else if (std::type_index(attr.type()) == std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int64_t))) { + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); } else if (std::type_index(attr.type()) == std::type_index(typeid(float))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::string))) { + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); } else { - // do nothing, skip useless attrs now - // TODO(chenweihang): support other attr type later and throw error - // if attr is cannot parsed + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported attribute type is received when call " + "InferShapeFunctor.")); } } else { // do nothing diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..755ca3f5ce90b7bcc85e904089262fd7f7e401cb --- /dev/null +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -0,0 +1,163 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/pten/core/compat/op_utils.h" +#include "paddle/pten/core/infermeta_utils.h" + +namespace paddle { +namespace framework { + +void TestInferMeta(bool bool_attr, int int_attr, int64_t int64_attr, + float float_attr, const std::string& str_attr, + const std::vector& vec_bool_attr, + const std::vector& vec_int_attr, + const std::vector& vec_int64_attr, + const std::vector& vec_float_attr, + const std::vector& vec_double_attr, + const std::vector& vec_str_attr) { + ASSERT_EQ(bool_attr, true); + ASSERT_EQ(int_attr, 10); + ASSERT_EQ(int64_attr, 100); + ASSERT_NEAR(float_attr, 3.14, 1e-6); + ASSERT_EQ(str_attr, "test"); + ASSERT_EQ(vec_bool_attr.at(0), true); + ASSERT_EQ(vec_bool_attr.at(1), true); + ASSERT_EQ(vec_int_attr.at(0), 10); + ASSERT_EQ(vec_int_attr.at(1), 10); + ASSERT_EQ(vec_int64_attr.at(0), 100L); + ASSERT_EQ(vec_int64_attr.at(1), 100L); + ASSERT_NEAR(vec_float_attr.at(0), 3.14, 1e-6); + ASSERT_NEAR(vec_float_attr.at(1), 3.14, 1e-6); + ASSERT_NEAR(vec_double_attr.at(0), 3.1415, 1e-6); + ASSERT_NEAR(vec_double_attr.at(1), 3.1415, 1e-6); + ASSERT_EQ(vec_str_attr.at(0), "test_vec"); + ASSERT_EQ(vec_str_attr.at(1), "test_vec"); +} + +class InferShapeUtilsTestOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddAttr("bool", "bool attr of test op"); + AddAttr("int", "int attr of test op"); + AddAttr("int64", "int64 attr of test op"); + AddAttr("float", "float attr of test op"); + AddAttr("string", "string attr of test op"); + AddAttr>("vec_bool", "vec_bool attr of test op"); + AddAttr>("vec_int", "vec_int attr of test op"); + AddAttr>("vec_int64", "vec_int attr of test op"); + AddAttr>("vec_float", "vec_int attr of test op"); + AddAttr>("vec_double", "vec_int attr of test op"); + AddAttr>("vec_str", "vec_int attr of test op"); + AddComment("This is test op"); + } +}; + +class InferShapeUtilsTestOp : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + return OpKernelType(proto::VarType::FP32, ctx.GetPlace()); + } +}; + +pten::KernelSignature InferShapeUtilsTestOpArgumentMapping( + const pten::ArgumentMappingContext& ctx) { + return pten::KernelSignature( + "infer_shape_utils_test", {}, + {"bool", "int", "int64", "float", "string", "vec_bool", "vec_int", + "vec_int64", "vec_float", "vec_double", "vec_str"}, + {}); +} + +} // namespace framework +} // namespace paddle + +DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, + InferShapeUtilsTestInferShapeFunctor, + PT_INFER_META(paddle::framework::TestInferMeta)); +REGISTER_OPERATOR(infer_shape_utils_test, + paddle::framework::InferShapeUtilsTestOp, + paddle::framework::InferShapeUtilsTestOpMaker, + InferShapeUtilsTestInferShapeFunctor); + +TEST(InferShapeUtilsTest, ALL) { + paddle::framework::ProgramDesc prog; + paddle::framework::proto::BlockDesc proto_block; + paddle::framework::BlockDesc block_desc(&prog, &proto_block); + + auto* op = block_desc.AppendOp(); + op->SetType("infer_shape_utils_test"); + + paddle::framework::Attribute bool_attr(true); + op->SetAttr("bool", bool_attr); + + paddle::framework::Attribute int_attr(10); + 
op->SetAttr("int", int_attr); + + int64_t int64_val = 100; + paddle::framework::Attribute int64_attr(int64_val); + op->SetAttr("int64", int64_attr); + + float float_value = 3.14; + paddle::framework::Attribute float_attr(float_value); + op->SetAttr("float", float_attr); + + std::string str_value("test"); + paddle::framework::Attribute str_attr(str_value); + op->SetAttr("string", str_attr); + + std::vector vec_bool(2, true); + paddle::framework::Attribute vec_bool_attr = vec_bool; + op->SetAttr("vec_bool", vec_bool_attr); + + std::vector vec_int(2, 10); + paddle::framework::Attribute vec_int_attr = vec_int; + op->SetAttr("vec_int", vec_int_attr); + + std::vector vec_int64(2, 100); + paddle::framework::Attribute vec_int64_attr = vec_int64; + op->SetAttr("vec_int64", vec_int64_attr); + std::cout << "after set vec_int64" << std::endl; + + std::vector vec_float(2, 3.14); + paddle::framework::Attribute vec_float_attr = vec_float; + op->SetAttr("vec_float", vec_float_attr); + + std::vector vec_double(2, 3.1415); + paddle::framework::Attribute vec_double_attr = vec_double; + op->SetAttr("vec_double", vec_double_attr); + + std::vector vec_str(2, "test_vec"); + paddle::framework::Attribute vec_str_attr = vec_str; + op->SetAttr("vec_str", vec_str_attr); + + pten::OpUtilsMap::Instance().InsertArgumentMappingFn( + "infer_shape_utils_test", + paddle::framework::InferShapeUtilsTestOpArgumentMapping); + + op->InferShape(block_desc); +} diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 829f43effb6d2878b63694d25b23ff7396ff61c2..0e1e572a51f7fcbc84415bab3808dfaed97dfd08 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -103,6 +103,8 @@ target_link_libraries(generate_pass pass_desc_proto) if(WITH_TENSORRT) pass_library(trt_map_matmul_to_mul_pass inference) + pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) + pass_library(preln_skip_layernorm_fuse_pass inference) endif() if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index 619976d45fb0d9675e09046f2fad8fc3bbf5d90a..b56c9cb13ccdc2dd1c7a1dfcd1aad6da27590cae 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/pten/core/kernel_factory.h" namespace paddle { namespace framework { @@ -271,25 +272,41 @@ bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU( if (op_type == "c_sync_calc_stream" || op_type == "c_sync_comm_stream") { return true; } - auto &all_kernels = OperatorWithKernel::AllOpKernels(); - auto it = all_kernels.find(op_type); - // skip op not has kernel - if (it != all_kernels.end()) { - bool support_cpu = false; - bool support_gpu = false; - for (auto &kernel_pair : it->second) { - if (platform::is_cpu_place(kernel_pair.first.place_)) { - support_cpu = true; - } - if (platform::is_gpu_place(kernel_pair.first.place_)) { - support_gpu = true; + bool support_cpu = false; + bool support_gpu = false; + auto &kernel_factory = pten::KernelFactory::Instance(); + auto kernel_key_map = + kernel_factory.SelectKernelMap(pten::TransToPtenKernelName(op_type)); + bool 
has_op_kernel = kernel_key_map.size() > 0 ? true : false; + for (auto &kernel : kernel_key_map) { + if (platform::is_gpu_place( + pten::TransToPtenPlace(kernel.first.backend()))) { + support_gpu = true; + } else if (platform::is_cpu_place( + pten::TransToPtenPlace(kernel.first.backend()))) { + support_cpu = true; + } + } + + if (!support_cpu || !support_gpu) { + auto &all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + // skip op not has kernel + if (it != all_kernels.end()) { + has_op_kernel = true; + for (auto &kernel_pair : it->second) { + if (platform::is_cpu_place(kernel_pair.first.place_)) { + support_cpu = true; + } else if (platform::is_gpu_place(kernel_pair.first.place_)) { + support_gpu = true; + } } } - VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu - << ", support GPU: " << support_gpu; - return support_cpu && support_gpu; } - return true; + + VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu + << ", support GPU: " << support_gpu; + return has_op_kernel ? (support_cpu && support_gpu) : true; } bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck( diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc index abed6a5bd4bc48e01d9bcf20abf1bed236ed847a..ed9f6230720f83100e641068c8664d643b6db260 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc @@ -26,7 +26,7 @@ USE_OP(mul); USE_OP(cinn_launch); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle::framework { using Name2VarInfoMap = diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index 746d90cef917cdb8c4740adf7dff3438c2ca1249..d33dc7f49feb0f4c9e585d13186d65b6c2d618c0 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -23,8 +23,8 @@ USE_OP_ITSELF(scale); USE_OP(elementwise_mul); -USE_OP(elementwise_add); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add); +USE_OP_ITSELF(elementwise_add_grad); DECLARE_double(eager_delete_tensor_gb); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 5f819ddbfaf8b88732b35119014c34644a1c402b..96aa95bde337436dd6eb584b3eea5395b5301a34 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -29,7 +29,7 @@ USE_OP(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(gelu); USE_OP_DEVICE_KERNEL(gelu, MKLDNN); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 90dc7801131074868073e1307ae7bfc51f2c3631..ea335e9bd63c624310df2f092b13e30a9458bb93 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ 
b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -22,7 +22,7 @@ USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 016d0fd4a663ecfcc8d2b23ddb2a3af7b610b6cd..acfe8d53cea13cb5ac9797ea7d43311d01b9041b 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -47,6 +47,8 @@ constexpr char kPassRecorder[] = "pass_recorder"; constexpr char kEmbEltwiseLayernormPass[] = "embedding_eltwise_layernorm_fuse_pass_flag"; constexpr char kMultiheadMatmulPass[] = "multihead_matmul_fuse_pass_flag"; +constexpr char kPrelnEmbEltwiseLayernormPass[] = + "preln_embedding_eltwise_layernorm_fuse_pass_flag"; class Pass { public: diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca42a613411ba6078b00522d2c178856993fa462 --- /dev/null +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc @@ -0,0 +1,450 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name, + const std::string& arg, + bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + PDNode* node = + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); + if (is_persist) return node->assert_is_persistable_var(); + return node; +} +static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name, + const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + PDNode* node = pattern->NewNode(name) + ->assert_is_only_output_of_ops(embedding_ops) + ->assert_is_op_input("elementwise_add", arg) + ->AsIntermediate(); + return node; +} +void PrelnEmbedding2Eltwise1Pattern::operator()() { + auto* lookup_table1_x = + create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); + auto* lookup_table2_x = + create_emb_vars(pattern, lookup_table2_x_repr(), "Ids"); + auto* lookup_table1_w = + create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + auto* lookup_table2_w = + create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + auto* lookup_table1 = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); + auto* lookup_table2 = + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); + auto* lookup_table1_out = + create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); + auto* lookup_table2_out = + create_emb_out_vars(pattern, lookup_table2_out_repr(), "Y"); + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add"); + lookup_table1->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + lookup_table2->LinksFrom({lookup_table2_x, lookup_table2_w}) + .LinksTo({lookup_table2_out}); + eltwise_add->LinksFrom({lookup_table1_out, lookup_table2_out}) + .LinksTo({eltwise_add_out}); +} +void PrelnEmbedding1Eltwise1Pattern::operator()() { + auto* lookup_table1_x = + create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); + auto* lookup_table1_w = + create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + auto* lookup_table1 = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); + auto* lookup_table1_out = + create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_in = pattern->NewNode(eltwise_add_in_repr()) + ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_output("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add"); + lookup_table1->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + eltwise_add->LinksFrom({lookup_table1_out, eltwise_add_in}) + .LinksTo({eltwise_add_out}); +} +void PrelnSkipLayerNorm::operator()() { + auto* eltwise_add 
= + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("layer_norm", "X") + ->assert_is_op_input("elementwise_add", "Y"); + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_out = pattern->NewNode(layer_norm_out_repr()) + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + eltwise_add->LinksTo({eltwise_add_out}); + layer_norm + ->LinksFrom({eltwise_add_out, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo({layer_norm_out, layer_norm_mean_var, layer_norm_variance_var}); +} + +} // namespace patterns + +int PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion( + Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + std::vector<std::vector<std::pair<Node*, Node*>>> start_pattern_in_nodes; + std::vector<Node*> start_pattern_out_node; + std::vector<std::unordered_set<const Node*>> start_pattern_remove_nodes; + + // Create pattern. + patterns::PrelnEmbedding2Eltwise1Pattern start_pattern(pattern, + name_scope + "/start"); + start_pattern(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_w, lookup_table2_w, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2, lookup_table2, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_out, lookup_table1_out, + start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_out, lookup_table2_out, + start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, start_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) + << "Pass(PrelnEmbedding2Eltwise1Pattern) in op compat failed."; + return; + } + std::vector<std::pair<Node*, Node*>> ins; + ins.push_back(std::make_pair(lookup_table1_x, lookup_table1_w)); + ins.push_back(std::make_pair(lookup_table2_x, lookup_table2_w)); + start_pattern_in_nodes.push_back(ins); + start_pattern_out_node.push_back(eltwise_add_out); + + std::unordered_set<const Node*> rm_nodes; + rm_nodes.insert({lookup_table1, lookup_table2, lookup_table1_out, + lookup_table2_out, eltwise_add, eltwise_add_out}); + start_pattern_remove_nodes.push_back(rm_nodes); + }; + gpd(graph, handler); + + std::vector<std::pair<Node*, Node*>> inner_pattern_ins; + std::vector<Node*> inner_pattern_tmp_in; + std::vector<Node*> inner_pattern_out; + std::vector<std::unordered_set<const Node*>> inner_pattern_remove_nodes; + + GraphPatternDetector gpd2; + auto* pattern2 = gpd2.mutable_pattern(); + patterns::PrelnEmbedding1Eltwise1Pattern second_pattern( +
pattern2, name_scope + "/second"); + second_pattern(); + auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_out, lookup_table1_out, + second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_in, eltwise_add_in, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, second_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) + << "Pass(PrelnEmbedding1Eltwise1Pattern) in op compat failed."; + return; + } + auto in = std::make_pair(lookup_table1_x, lookup_table1_w); + inner_pattern_ins.push_back(in); + inner_pattern_tmp_in.push_back(eltwise_add_in); + inner_pattern_out.push_back(eltwise_add_out); + + std::unordered_set rm_nodes; + rm_nodes.insert({lookup_table1, lookup_table1_out, eltwise_add}); + inner_pattern_remove_nodes.push_back(rm_nodes); + }; + gpd2(graph, handler2); + + std::vector end_pattern_elt_out; + std::vector end_pattern_scales; + std::vector end_pattern_biases; + std::vector end_pattern_out; + std::vector end_patter_layernorms; + std::vector end_patter_elementwise; + std::vector> end_pattern_remove_nodes; + GraphPatternDetector gpd3; + auto* pattern3 = gpd3.mutable_pattern(); + patterns::PrelnSkipLayerNorm skip_layernorm_pattern(pattern3, + name_scope + "/third"); + skip_layernorm_pattern(); + auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, + skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, + skip_layernorm_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(PrelnSkipLayerNorm) in op compat failed."; + return; + } + end_pattern_elt_out.push_back(eltwise_add_out); + std::unordered_set rm_nodes; + rm_nodes.insert({layer_norm, layer_norm_mean, layer_norm_variance}); + end_pattern_remove_nodes.push_back(rm_nodes); + end_pattern_biases.push_back(layer_norm_bias); + end_pattern_scales.push_back(layer_norm_scale); + end_pattern_out.push_back(layer_norm_out); + end_patter_layernorms.push_back(layer_norm); + end_patter_elementwise.push_back(eltwise_add); + }; + gpd3(graph, handler3); + + if (start_pattern_in_nodes.empty() || end_pattern_elt_out.empty()) { + return 0; + } + // only reserve the subgraphs that in connected domains. + int fusion_count = 0; + // fusion_id for (i, k, js) + std::vector>>> + fusion_ids; + for (size_t i = 0; i < start_pattern_in_nodes.size(); ++i) { + Node* tmp = start_pattern_out_node[i]; + Node* old_tmp = nullptr; + // get correct inner pattern node order. 
+ std::vector js; + while (tmp != old_tmp) { + old_tmp = tmp; + for (size_t j = 0; j < inner_pattern_tmp_in.size(); ++j) { + if (inner_pattern_tmp_in[j] == tmp) { + tmp = inner_pattern_out[j]; + js.push_back(j); + break; + } + } + } + + for (size_t k = 0; k < end_pattern_elt_out.size(); ++k) { + if (tmp == end_pattern_elt_out[k]) { + fusion_ids.push_back(std::make_pair(i, std::make_pair(k, js))); + break; + } + } + } + + for (size_t num = 0; num < fusion_ids.size(); ++num) { + int i = fusion_ids[num].first; + int k = fusion_ids[num].second.first; + std::vector js = fusion_ids[num].second.second; + + std::vector ids; + std::vector embs; + for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { + ids.push_back(start_pattern_in_nodes[i][iter].first->Name()); + embs.push_back(start_pattern_in_nodes[i][iter].second->Name()); + } + for (size_t iter = 0; iter < js.size(); ++iter) { + ids.push_back(inner_pattern_ins[js[iter]].first->Name()); + embs.push_back(inner_pattern_ins[js[iter]].second->Name()); + } + + OpDesc new_op_desc; + new_op_desc.SetType("fused_preln_embedding_eltwise_layernorm"); + new_op_desc.SetInput("Ids", ids); + new_op_desc.SetInput("Embs", embs); + new_op_desc.SetInput("WordId", {ids[0]}); + new_op_desc.SetInput("PosId", {ids[1]}); + if (ids.size() > 2) { + new_op_desc.SetInput("SentId", {ids[2]}); + } + + new_op_desc.SetInput("WordEmbedding", {embs[0]}); + new_op_desc.SetInput("PosEmbedding", {embs[1]}); + if (embs.size() > 2) { + new_op_desc.SetInput("SentEmbedding", {embs[2]}); + } + + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); + new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); + new_op_desc.SetOutput("Out_0", {end_pattern_out[k]->Name()}); + new_op_desc.SetOutput("Out_1", {inner_pattern_out[k]->Name()}); + new_op_desc.SetAttr("epsilon", + end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold") && + end_patter_elementwise[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + new_op_desc.SetAttr( + "out_0_threshold", + end_patter_layernorms[k]->Op()->GetAttr("out_threshold")); + new_op_desc.SetAttr( + "out_1_threshold", + end_patter_elementwise[k]->Op()->GetAttr("out_threshold")); + } + + auto* preln_embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); + + for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { + IR_NODE_LINK_TO(start_pattern_in_nodes[i][iter].first, + preln_embedding_eltwise_layernorm); + IR_NODE_LINK_TO(start_pattern_in_nodes[i][iter].second, + preln_embedding_eltwise_layernorm); + } + for (size_t iter = 0; iter < js.size(); ++iter) { + IR_NODE_LINK_TO(inner_pattern_ins[js[iter]].first, + preln_embedding_eltwise_layernorm); + IR_NODE_LINK_TO(inner_pattern_ins[js[iter]].second, + preln_embedding_eltwise_layernorm); + } + IR_NODE_LINK_TO(end_pattern_biases[k], preln_embedding_eltwise_layernorm); + IR_NODE_LINK_TO(end_pattern_scales[k], preln_embedding_eltwise_layernorm); + IR_NODE_LINK_TO(preln_embedding_eltwise_layernorm, end_pattern_out[k]); + IR_NODE_LINK_TO(preln_embedding_eltwise_layernorm, inner_pattern_out[k]); + + // Remove unneeded nodes. 
+ std::unordered_set marked_nodes; + marked_nodes.insert(start_pattern_remove_nodes[i].begin(), + start_pattern_remove_nodes[i].end()); + marked_nodes.insert(end_pattern_remove_nodes[k].begin(), + end_pattern_remove_nodes[k].end()); + for (size_t iter = 0; iter < js.size(); ++iter) { + marked_nodes.insert(inner_pattern_remove_nodes[js[iter]].begin(), + inner_pattern_remove_nodes[js[iter]].end()); + } + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + } + + return fusion_count; +} + +PrelnEmbeddingEltwiseLayerNormFusePass:: + PrelnEmbeddingEltwiseLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); +} + +void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + int fusion_count = + PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion(graph, name_scope_); + if (fusion_count > 0) { + graph->Set(kPrelnEmbEltwiseLayernormPass, new bool(true)); + } + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(preln_embedding_eltwise_layernorm_fuse_pass, + paddle::framework::ir::PrelnEmbeddingEltwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(preln_embedding_eltwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("lookup_table", 1) + .LE("lookup_table_v2", 1) + .LE("elementweise_add", 1)); diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..1ccc6c85d4860540dfa7a74911c6633180850344 --- /dev/null +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.h @@ -0,0 +1,166 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// detect start pattern. 
+// +// in_var emb in_var emb +// | | | | +// lookup_table lookup_table +// | | +// lkt_var lkt_var +// \ / +// elementwise_add +// | +// elt_out_var +// +struct PrelnEmbedding2Eltwise1Pattern : public PatternBase { + PrelnEmbedding2Eltwise1Pattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "Prelnembedding2_eltwise1") {} + + void operator()(); + + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table2_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table2_w); + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table2); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(lookup_table2_out); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); +}; + +// detect repeats inner pattern +// +// elt_out_var in_var emb +// \ | | +// \ lookup_table +// \ | +// \ lkt_var +// \ / +// elementwise_add +// | | +// elementwise_add elt_out_var +// +struct PrelnEmbedding1Eltwise1Pattern : public PatternBase { + PrelnEmbedding1Eltwise1Pattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "Prelnembedding1_eltwise1") {} + void operator()(); + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(eltwise_add_in); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); +}; + +// detect end pattern +// +// elementwise_add +// | | +// | elt_out_var +// | scale | bias +// | \ | / +// elementwise_add layer_norm +// +struct PrelnSkipLayerNorm : public PatternBase { + PrelnSkipLayerNorm(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "Prelnskip_layernorm") {} + void operator()(); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); + // Delete the mean and var nodes in the graph. + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); +}; +} // namespace patterns + +// The PrelnEmbeddingEltwiseLayerNormFusePass detect the following pattern: +// +// inputs operator output +// -------------------------------------------------------------------- +// (word, weights_0) lookup_table -> word_emb +// (pos, weights_1) lookup_table -> pos_emb +// (sent, weights_2) lookup_table -> sent_emb +// (word_emb, pos_emb) elementweise_add -> elementwise_out_0 +// (elemtwise_out_0, sent_emb) elementweise_add -> elementwise_out_1 +// (elementwise_out_1, scale, bias) layer_norm -> layer_norm_out +// +// and then convert the corresponding subgraph to: +// +// (word, pos, sent, weights_0, weights_1, weights_2, +// scale, baias) Prelnembedding_eltwise_layernorm -> layer_norm_out + +// elementwise_add_out +// +// +// in_var emb_var in_var emb_var in_var emb_var in_var emb_var +// | | | | | | | | +// lookup_table lookup_table lookup_table ... lookup_table +// | | | | +// lkt_var lkt_var lkt_var lkt_var +// \ / | ... 
| +// elementwise_add | | +// \ / | +// elementwise_add | +// | | +// elt_var / +// \ / +// elementwise_add +// | | +// elementwise_add layer_norm + +class PrelnEmbeddingEltwiseLayerNormFusePass : public FusePassBase { + public: + PrelnEmbeddingEltwiseLayerNormFusePass(); + virtual ~PrelnEmbeddingEltwiseLayerNormFusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + int BuildFusion(Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const; + const std::string name_scope_{"preln_embedding_eltwise_layernorm_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1b7b82cbca9e86587467fa0888eca6c6fdc2e162 --- /dev/null +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc @@ -0,0 +1,210 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct PrelnSkipLayerNorm : public PatternBase { + PrelnSkipLayerNorm(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "preln_skip_layernorm") {} + + void operator()(PDNode *x, PDNode *y); + + // declare operator node's name + PATTERN_DECL_NODE(fused_skipe_layernorm); + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(layer_norm); + // declare variable node's name + PATTERN_DECL_NODE( + elementwise_out); // (elementwise_input_x,elementwise_input_y) -> + // elementwise_out + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); +}; + +void PrelnSkipLayerNorm::operator()(PDNode *x, PDNode *y) { + // Create nodes for elementwise add op. + x->assert_is_op_input("elementwise_add", "X"); + y->assert_is_op_input("elementwise_add", "Y"); + auto *elementwise = + pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); + auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("layer_norm", "X") + ->assert_is_op_input("elementwise_add", "Y"); + + // Add links for elementwise_add op. + elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); + + // Create nodes for layer_norm op. 
+ auto *layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto *layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto *layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + + auto *layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Y"); + auto *layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto *layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + + // Add links for layer_norm op. + layer_norm + ->LinksFrom( + {elementwise_out_var, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); +} + +} // namespace patterns + +void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init("preln_skip_layernorm_fuse", graph); + int found_subgraph_count = 0; + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode("preln_skip_layernorm_fuse/x") + ->AsInput() + ->assert_is_op_input("elementwise_add", "X") + ->assert_var_not_persistable(); + auto *y = gpd.mutable_pattern() + ->NewNode("preln_skip_layernorm_fuse/y") + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y") + ->assert_var_not_persistable(); + patterns::PrelnSkipLayerNorm fused_pattern(gpd.mutable_pattern(), + "preln_skip_layernorm_fuse"); + fused_pattern(x, y); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + if (subgraph.count(x) <= 0 || subgraph.count(y) <= 0) { + LOG(WARNING) << "The subgraph is empty."; + return; + } + + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "preln_skip_layernorm pass in op compat failed."; + return; + } + + VLOG(4) << "handle PrelnSkipLayerNorm fuse"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale, + fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, + fused_pattern); + + std::unordered_set del_node_set; + + // Create an PrelnSkipLayerNorm op node + OpDesc new_desc; + new_desc.SetType("preln_skip_layernorm"); + + // inputs + new_desc.SetInput("X", {subgraph.at(x)->Name()}); + new_desc.SetInput("Y", {subgraph.at(y)->Name()}); + new_desc.SetInput("Scale", {layer_norm_scale->Name()}); + new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + + if (elementwise->Op()->HasAttr("out_threshold") && + layer_norm->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + new_desc.SetAttr("out_0_threshold", + layer_norm->Op()->GetAttr("out_threshold")); + new_desc.SetAttr("out_1_threshold", + elementwise->Op()->GetAttr("out_threshold")); + } + + // outputs + 
new_desc.SetOutput("Out_0", {layer_norm_out->Name()}); + new_desc.SetOutput("Out_1", {elementwise_out->Name()}); + + // attrs + new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon")); + new_desc.SetAttr("begin_norm_axis", + layer_norm->Op()->GetAttr("begin_norm_axis")); + + auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. + + del_node_set.insert(elementwise); + del_node_set.insert(layer_norm); + del_node_set.insert(layer_norm_mean); + del_node_set.insert(layer_norm_variance); + GraphSafeRemoveNodes(graph, del_node_set); + + IR_NODE_LINK_TO(subgraph.at(x), fused_node); + IR_NODE_LINK_TO(subgraph.at(y), fused_node); + IR_NODE_LINK_TO(layer_norm_scale, fused_node); + IR_NODE_LINK_TO(layer_norm_bias, fused_node); + IR_NODE_LINK_TO(fused_node, layer_norm_out); + IR_NODE_LINK_TO(fused_node, elementwise_out); + + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(preln_skip_layernorm_fuse_pass, + paddle::framework::ir::PrelnSkipLayerNormFusePass); +REGISTER_PASS_CAPABILITY(preln_skip_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..52447bfd8d3f1b8cb56080d8fd753a559477c783 --- /dev/null +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +// | | | | +// other_op1 other_op2 other_op1 other_op2 +// | | fuse \ / +// |------elementwise_add -> skip_layernorm +// | | | | +// other_op4 layer_norm other_op4 other_op3 +// | +// other_op3 +class Graph; + +class PrelnSkipLayerNormFusePass : public FusePassBase { + public: + PrelnSkipLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({0, -1}) + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + } + + virtual ~PrelnSkipLayerNormFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc index c917630666b082ab7148550707f9f1f720aa25d3..2f3c3f3d06e327bc583c817bdfcc78345d8adff5 100644 --- a/paddle/fluid/framework/naive_executor_test.cc +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -67,4 +67,4 @@ TEST(NaiveExecutor, Basic) { } // namespace framework } // namespace paddle -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index b42f2da2a4d78b2913aedd01172771ce51926a2a..a0708f28e37ee2088d82f1b73b79f1452dc0f262 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -25,12 +25,12 @@ USE_OP(fill_constant); USE_OP(uniform_random); USE_OP(lookup_table); USE_OP(transpose2); -USE_OP(reshape2); +USE_OP_ITSELF(reshape2); USE_OP(split); USE_OP(slice); USE_OP(concat); USE_OP(matmul); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(sigmoid); USE_OP(tanh); USE_OP(elementwise_mul); @@ -39,9 +39,9 @@ USE_OP(reduce_mean); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); USE_OP(reduce_mean_grad); -USE_OP(reshape2_grad); +USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add_grad); USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc index 7dac6a092d245fab3781c0af0bb6d4162b5be47c..9d1f09869988df96205cad5cc29aba8ea7edd945 100644 --- a/paddle/fluid/framework/op_kernel_type.cc +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -47,10 +47,20 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { "Too many OpKernel attribute values, expected maximum " "value is 64, received value is %d.", cur_loc)); - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::hash hasher; + size_t seed = + hasher(place + data_type + data_layout + library_type + customized_value); + if (platform::is_custom_place(key.place_)) { + seed ^= std::hash{}(key.place_.GetDeviceType()) + 0x9e3779b9 + + (seed << 6) + (seed >> 2) 
+ 4; + } + return seed; +#else std::hash hasher; return hasher(place + data_type + data_layout + library_type + customized_value); +#endif } bool OpKernelType::operator==(const OpKernelType& o) const { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4670f043102d917f770b6fa5ca661a860941df33..7ab4e2acecfccd913343fc453338a26ddd9c92dd 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/scalar.h" @@ -244,6 +245,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #else auto dev_id = place.device; platform::SetMLUDeviceId(dev_id); +#endif + } else if (platform::is_custom_place(place)) { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with CustomDevice support.", + place)); +#else + platform::DeviceManager::SetDevice(place); #endif } @@ -1326,8 +1336,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( const ExecutionContext& ctx) const { - auto& dev_ctx = ctx.device_context(); - auto expected_kernel_key = this->GetExpectedKernelType(ctx); if (HasAttr("op_device")) { if (Attr("op_device") == "cpu") { @@ -1344,12 +1352,20 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel // will be executed and a warning will be given at the same time. + expected_kernel_key.place_ = platform::CPUPlace(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (SupportGPU()) { + auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); - } else if (SupportNPU()) { + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + if (SupportNPU()) { + auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); - } else { - expected_kernel_key.place_ = platform::CPUPlace(); + } +#endif + if (platform::is_cpu_place(expected_kernel_key.place_)) { LOG_FIRST_N(WARNING, 1) << "Op(" << type_ << ") has no CUDA implementation. 
It will be assigned to CPUPlace."; @@ -1924,12 +1940,10 @@ Scope* OperatorWithKernel::PreparePtenData( for (size_t i = 0; i < input_defs.size(); ++i) { auto& in_def = input_defs.at(i); - auto it = ctx->inputs.find(input_names[i]); - if (it == ctx->inputs.end()) { + if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) { continue; } - - auto& ins_vector = it->second; + auto& ins_vector = ctx->inputs.at(input_names[i]); auto& name_vec = name_map.at(input_names[i]); bool should_skip_input = no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0; @@ -1940,7 +1954,6 @@ Scope* OperatorWithKernel::PreparePtenData( if (var == nullptr || !VarIsTensor(*var)) { continue; } - auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); // When no_buffer_ins then checking of Tensor::holder_ is @@ -2165,6 +2178,8 @@ void OperatorWithKernel::BuildPtenKernelContext( pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { + pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bca6a0a4cb8e0d61574f2b7be00e1f67b70ec035..79e6da987ef09db5ed43dfb8168dd13fa0cf885e 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -661,6 +661,6 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); USE_OP(relu); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(relu_grad); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 535c9ab58e295fae2048bb162adfb0384745d0ae..c62ece7f0dccc2612b6b53371805d29375416772 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -88,7 +88,7 @@ const CinnCompiledObject& CinnCompiler::Compile( if (cache_by_struct_.count(cur_key_by_struct) != 0) { exist = true; cache_by_address_[cur_key_by_address] = - cache_by_struct_.at(cur_key_by_struct).get(); + cache_by_struct_.at(cur_key_by_struct); } } } @@ -98,12 +98,13 @@ const CinnCompiledObject& CinnCompiler::Compile( CompileGraph(graph, input_tensors, target, compiled_num, stream); pten::AutoWRLock w_guard{&rwlock_}; if (!cache_by_struct_.count(cur_key_by_struct)) { - cache_by_address_[cur_key_by_address] = compiled_res.get(); - cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); + cache_by_address_[cur_key_by_address] = compiled_num; + cache_by_struct_[cur_key_by_struct] = compiled_num; + index2cache_.emplace(compiled_num, std::move(compiled_res)); } } pten::AutoRDLock guard{&rwlock_}; - const auto& cached_boj = *cache_by_address_[cur_key_by_address]; + const auto& cached_boj = *index2cache_[cache_by_address_[cur_key_by_address]]; return cached_boj; } @@ -115,6 +116,15 @@ const CinnCompiledObject& CinnCompiler::Compile( return Compile(graph, input_tensors, target, stream); } +const CinnCompiledObject& CinnCompiler::GetCompiledObject( + int64_t cached_index) const { + auto 
res = index2cache_.find(cached_index); + PADDLE_ENFORCE_NE(res, index2cache_.end(), + platform::errors::InvalidArgument( + "Index(%ld) not found in cache", cached_index)); + return *res->second; +} + std::string CinnCompiler::AddGraph(std::unique_ptr graph) { std::string graph_key; ProgramDesc program; @@ -202,6 +212,7 @@ void CinnCompiler::Clear() { graphs_.clear(); cache_by_address_.clear(); cache_by_struct_.clear(); + index2cache_.clear(); } real_compiled_num_.store(0); } @@ -240,6 +251,7 @@ std::unique_ptr CinnCompiler::CompileGraph( compiled_obj->launch_context = std::make_unique( compiled_obj->paddle2cinn_varmap, compiled_obj->scope); + compiled_obj->cached_index = compiled_num; return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 91a7b4e5a11f0054112df9645c4f8b8f3c22501b..d7ae743111ea73fe9d931a79e89cb08a406b60ce 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -53,6 +53,7 @@ struct CinnCompiledObject { std::shared_ptr<::cinn::hlir::framework::Scope> scope; std::unordered_map paddle2cinn_varmap; std::unique_ptr launch_context; + std::int64_t cached_index; }; // Entrance to use CINN. @@ -76,6 +77,8 @@ class CinnCompiler { const std::map& input_tensors, const ::cinn::common::Target& target, void* stream = nullptr); + const CinnCompiledObject& GetCompiledObject(int64_t cached_index) const; + std::string AddGraph(std::unique_ptr graph); const ir::Graph& FindGraph(const std::string& graph_key) const; @@ -101,12 +104,12 @@ class CinnCompiler { void* stream = nullptr) const; std::unordered_map> graphs_; - std::unordered_map + std::unordered_map cache_by_address_; - std::unordered_map, CinnCacheKey::Hash> + std::unordered_map cache_by_struct_; + std::unordered_map> + index2cache_; std::atomic_int64_t real_compiled_num_{0}; mutable pten::RWLock rwlock_; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 6769413d99bafd7a26a3486da6928d06ad920ace..05cd9e8a2e8a0d9fb533d9b92b7e1c9d7742629b 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -270,13 +270,20 @@ TEST(CinnCompilerTest, Compile) { auto compile_fn = [&](const Target& target) { const auto& compiled_obj = cinn_compiler->Compile(compiling_graph, input_tensors, target); + ASSERT_NE(compiled_obj.compiler, nullptr); ASSERT_NE(compiled_obj.runtime_program, nullptr); ASSERT_NE(compiled_obj.scope, nullptr); ASSERT_FALSE(compiled_obj.paddle2cinn_varmap.empty()); + ASSERT_NE(compiled_obj.launch_context, nullptr); const auto& cached_obj = cinn_compiler->Compile(compilation_key, input_tensors, target); ASSERT_EQ(reinterpret_cast(&compiled_obj), reinterpret_cast(&cached_obj)); + ASSERT_EQ(cached_obj.cached_index + 1, cinn_compiler->real_compiled_num()); + const auto& ret_obj = + cinn_compiler->GetCompiledObject(cached_obj.cached_index); + ASSERT_EQ(reinterpret_cast(&compiled_obj), + reinterpret_cast(&ret_obj)); }; // GPU Compilation @@ -295,4 +302,4 @@ USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); USE_OP(relu); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index aed5e2c7405ac0782ef3d9438b4958432584525a..1a826f6bdd5e7344d9983c026fc2d4cc8812d15a 100644 --- 
a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -532,6 +532,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use XPU device since it's not compiled with XPU," "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + if (IsFastEagerDeletionModeEnabled()) { + gc.reset( + new CustomDeviceUnsafeFastGarbageCollector(place, max_memory_size)); + } else { + gc.reset(new CustomStreamGarbageCollector(place, max_memory_size)); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use custom device since it's not compiled with " + "CustomDevice," + "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else if (platform::is_cpu_place(place)) { gc.reset(new CPUGarbageCollector(place, max_memory_size)); diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index ea2e62d89f63da2bfe7e49c34e8aecad4e6138e0..2d2cc30497e288046256af5564620d40913cf3bf 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -186,8 +186,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(op_proto_->type(), GetInputArgsNames(), - GetAttrsArgsNames(), GetOutputArgsNames()); + return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), + GetInputArgsNames(), GetAttrsArgsNames(), + GetOutputArgsNames()); } std::once_flag kernel_sig_map_init_flag; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 844b5d82695009415815eaba819cf6a8bf5a89e3..e510257c6106b8d3540e927f0e6fd76a9e73ea09 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -91,7 +91,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -376,7 +398,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || - platform::is_mlu_place(dst_place)) { + platform::is_mlu_place(dst_place) || + 
platform::is_custom_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); @@ -436,6 +459,26 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_custom_place( + dst_place)) { /* custom_device -> custom_device*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -664,6 +707,13 @@ class AnyVisitor : public boost::static_visitor { const platform::CUDAPinnedPlace& cpu) const { return *out.data(); } + + bool GetResult(const framework::Tensor& out, + const platform::CustomPlace& custom_dev) const { + PADDLE_THROW(platform::errors::Unimplemented("Not supported on place (%s) ", + custom_dev)); + return false; + } }; template @@ -903,6 +953,11 @@ struct BothFalseVisitor : public boost::static_visitor<> { out_ptr[i] = lhs && rhs; } } + + void VisitorImpl(const platform::CustomPlace& custom_dev) const { + PADDLE_THROW( + platform::errors::Unimplemented("CustomPlace is not supported")); + } }; void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { @@ -1036,6 +1091,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); +#endif + } else if (platform::is_custom_place(tensor.place())) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& custom_device_context = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), tensor.place(), + reinterpret_cast(data), size_to_write, + custom_device_context.stream()); + custom_device_context.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported when not compiled with " + "CustomDevice")); #endif } else { os.write(static_cast(data_ptr), @@ -1093,10 +1171,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace())) { + platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) Tensor cpu_tensor; 
cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -1105,7 +1184,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); - if (platform::is_npu_place(dev_ctx.GetPlace())) { + if (platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { dev_ctx.Wait(); } #else @@ -1163,10 +1243,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace())) { + platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -1175,7 +1256,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); - if (platform::is_npu_place(dev_ctx.GetPlace())) { + if (platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { dev_ctx.Wait(); } #else @@ -1188,9 +1270,12 @@ } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "MLUPlace is not supported when not compiled with MLU")); - } else { + } else if (platform::is_npu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported when not compiled with CustomDevice")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index bcaf3c719cb720d76c78a2b15475652eda793cad..1c1a86f1d32d3c3553e2201432453e5e2fdaa1e3 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -180,6 +180,17 @@ void TensorFromArray(const T* src, const size_t& array_size, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + memory::Copy( + dst_place, dst_ptr, src_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromArray on %s is not supported.", dst_place)); + } } template @@ -241,6 +252,17 @@ void TensorFromVector(const std::vector<T>& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + memory::Copy( + dst_place, dst_ptr, src_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromVector on %s is not supported.", dst_place)); + } } // The fully specialized function should be inline to avoid @@ -300,6 +322,17 @@ inline void TensorFromVector(const std::vector<bool>& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { //
NOLINT + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromVector on %s is not supported.", dst_place)); + } delete[] array; } @@ -369,6 +402,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorToVector on %s is not supported.", src.place())); + } } template <> @@ -410,6 +452,11 @@ inline void TensorToVector(const Tensor& src, dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); + } #endif for (unsigned int i = 0; i < src.numel(); i++) { (*dst)[i] = static_cast(array[i]); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 89d9324039c15cecd8ba1518aae3645e2f540f9d..90cf0e76e000736f730121a6fcce841aa38a59ae 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -44,9 +44,9 @@ if(WITH_GLOO) endif() if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function pten_tensor) else() -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner pten_tensor) endif() add_subdirectory(tests) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 0913d54c8359aa48a1fd5213b87ddf632dc595d9..547fa02326bec36858717c8f66a268551423dbaa 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -340,8 +340,8 @@ NameVarMap AutoCastInputs(const std::string& op_type, } template NameVarMap AutoCastInputs( const std::string& op_type, const NameVarMap& ins); -template NameVarMap AutoCastInputs( - const std::string& op_type, const NameVarMap& ins); +template NameVarMap AutoCastInputs( + const std::string& op_type, const NameVarMap& ins); template NameVarMap CastPureFp16Inputs(const std::string& op_type, const NameVarMap& ins) { @@ -384,7 +384,7 @@ NameVarMap CastPureFp16Inputs(const std::string& op_type, } template NameVarMap CastPureFp16Inputs( const std::string& op_type, const NameVarMap& ins); -template NameVarMap CastPureFp16Inputs( - const std::string& op_type, const NameVarMap& ins); +template NameVarMap CastPureFp16Inputs( + const std::string& op_type, const NameVarMap& ins); } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 
d57cb696526b490fac9d2610320ede8eef665d4f..17ab1f1f7c53fe69e07e04df4f98baaaf10d615f 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -35,6 +35,9 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif namespace paddle { namespace imperative { @@ -180,6 +183,12 @@ class TensorAddFunctor : public boost::static_visitor<> { "is not supported in imperative mode", place)); } + void operator()(const platform::CustomPlace& place) const { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } private: int64_t numel_; @@ -243,6 +252,23 @@ TType& GetInnerTensor(const paddle::experimental::Tensor& src) { return *src_tensor; } +template +TType* GetEmptyInnerTensor(paddle::experimental::Tensor* dst) { + PADDLE_ENFORCE_EQ( + dst->defined(), false, + platform::errors::Fatal( + "The underlying Tensor implementation should be nullptr")); + dst->set_impl(std::make_shared()); + auto* dst_tensor = static_cast(dst->impl().get()); + return dst_tensor; +} + +template +TType* GetEmptyInnerTensor(paddle::imperative::VariableWrapper* dst) { + auto* dst_tensor = dst->MutableVar()->GetMutable(); + return dst_tensor; +} + template void TensorAdd(const VarType& src, VarType* dst) { pten::DenseTensor* dst_tensor = GetInnerMutableTensor(dst); @@ -314,7 +340,14 @@ void TensorAdd(const VarType& src, VarType* dst) { return; } #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } +#endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(place)) { if (data_type == framework::DataTypeTrait::DataType()) { @@ -332,6 +365,35 @@ void TensorAdd(const VarType& src, VarType* dst) { } #endif +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place)) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::DeviceContext* ctx = pool.Get(place); + auto dev_ctx = dynamic_cast(ctx); + if (data_type == framework::DataTypeTrait::DataType()) { + dst_tensor->mutable_data(place); + } else if (data_type == + framework::DataTypeTrait::DataType()) { + dst_tensor->mutable_data(place); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } + static const float alpha = 1.f; + static const float beta = 1.f; + operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor); + operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignAdd( + dev_ctx->cnnl_handle(), static_cast(&alpha), + src_tensor_desc.get(), operators::GetBasePtr(&src_tensor), nullptr, 0, + static_cast(&beta), dst_tensor_desc.get(), + operators::GetBasePtr(dst_tensor))); + return; + } +#endif + PADDLE_TENSOR_ADD(float); #ifndef PADDLE_WITH_XPU @@ -473,13 +535,14 @@ template void SelectedRowsAddTensor( // Note(chenweihang): when two selected rows need to be added, // adding one to another is not equal to merging two selected rows // to one then add it to a empty selected rows, the after is correct -// Note(chenweihang): when two selected rows need to 
be added, -// adding one to another is not equal to merging two selected rows -// to one then add it to a empty selected rows, the after is correct -std::shared_ptr SelectedRowsMerge( - const framework::Variable& src1, const framework::Variable& src2) { - auto& src_selected_rows1 = src1.Get(); - auto& src_selected_rows2 = src2.Get(); +template +std::shared_ptr SelectedRowsMerge(const VarType& src1, + const VarType& src2) { + const pten::SelectedRows& src_selected_rows1 = + GetInnerTensor(src1); + const pten::SelectedRows& src_selected_rows2 = + GetInnerTensor(src2); + auto place = src_selected_rows1.value().place(); auto data_type = framework::TransToProtoVarType(src_selected_rows1.value().dtype()); @@ -488,9 +551,10 @@ std::shared_ptr SelectedRowsMerge( std::vector src_selected_rows; src_selected_rows.emplace_back(&src_selected_rows1); src_selected_rows.emplace_back(&src_selected_rows2); - auto dst_var = std::make_shared("Temp"); - auto* dst_selected_rows = - dst_var->MutableVar()->GetMutable(); + + auto dst_var = std::make_shared("Temp"); + pten::SelectedRows* dst_selected_rows = + GetEmptyInnerTensor(dst_var.get()); #define PADDLE_SELECTED_ROWS_ADD(dev_ctx_type, cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ @@ -515,12 +579,17 @@ std::shared_ptr SelectedRowsMerge( #endif #undef PADDLE_SELECTED_ROWS_ADD - PADDLE_THROW(platform::errors::InvalidArgument( "Not supported data type %s for SelectedRowsMerge", framework::DataTypeToString(data_type))); } +template std::shared_ptr SelectedRowsMerge( + const paddle::experimental::Tensor& src1, + const paddle::experimental::Tensor& src2); +template std::shared_ptr SelectedRowsMerge( + const framework::Variable& src1, const framework::Variable& src2); + void VariableWrapperAdd(std::shared_ptr var, VariableWrapper* dst_var, bool unchange_input) { auto& src = var->Var(); @@ -547,7 +616,7 @@ void VariableWrapperAdd(std::shared_ptr var, *dst = std::move(*(var->MutableVar())); } } else if (src.IsType()) { - auto temp = SelectedRowsMerge(src, *dst); + auto temp = SelectedRowsMerge(src, *dst); *dst = std::move(*(temp->MutableVar())); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -603,7 +672,7 @@ void GradientAccumulator::AccumulateGrad() { SelectedRowsAddToTensor(*dst, src); *dst = std::move(*src); } else if (src->IsType()) { - auto temp = SelectedRowsMerge(*src, *dst); + auto temp = SelectedRowsMerge(*src, *dst); *dst = std::move(*(temp->MutableVar())); } } else { diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 6371f64fe61044d6cc9ea8a10e5dbcacd3d187e4..ee2df582e81ee5cefe1faf9f3700b91c6adae434 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -17,10 +17,10 @@ #include #include #include - #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/pten/api/include/tensor.h" namespace paddle { namespace imperative { @@ -164,6 +164,10 @@ class SortedGradientAccumulator : public GradientAccumulator { std::vector tmp_grad_vars_; }; +template +std::shared_ptr SelectedRowsMerge(const VarType& src1, + const VarType& src2); + template void SelectedRowsAddToTensor(const VarType& src, VarType* dst); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index b8c423f77bd235693f8bbf90a00630a8c855e00f..ed455b7fd0314e6d1e5cd38107568d5f8e89f84d 100644 --- a/paddle/fluid/imperative/layer.cc 
+++ b/paddle/fluid/imperative/layer.cc @@ -177,9 +177,9 @@ std::string LayerDebugString(const std::string& op_type, } std::string LayerDebugString(const std::string& op_type, - const NameVarMap& ins, - const NameVarMap& outs) { - return LayerDebugStringImpl(op_type, ins, outs); + const NameVarMap& ins, + const NameVarMap& outs) { + return LayerDebugStringImpl(op_type, ins, outs); } template @@ -194,11 +194,16 @@ static void SetForwardDataTypeOfGradVars(const NameVarMap& outs) { } } template <> -void SetForwardDataTypeOfGradVars( - const NameVarMap& outs) { +void SetForwardDataTypeOfGradVars( + const NameVarMap& outs) { // In eager mode we don't need this. } +void TestSetForwardDataTypeOfGradVarsEager( + const NameVarMap& outs) { + SetForwardDataTypeOfGradVars(outs); +} + VarBase::VarBase(const std::shared_ptr& var) : var_(var), grad_node_(var->GetGradNode()) { if (auto grad_var = var_->GetGradVar()) { @@ -528,12 +533,12 @@ void OpBase::Run(const framework::OperatorBase& op, } void OpBase::Run(const framework::OperatorBase& op, - const NameVarMap& ins, - const NameVarMap& outs, + const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } void ClearNoNeedBufferInputs(OpBase* op) { diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 58c77d0f4b6b7b7328b5d877f5a97410728ce39e..21167605d46029d2eb9d1ea3241f8d868a6a8344 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -185,8 +185,8 @@ class OpBase { const framework::AttributeMap& default_attrs, const platform::Place& place); static void Run(const framework::OperatorBase& op, - const NameVarMap& ins, - const NameVarMap& outs, + const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c8ff561f7af3ad85d74eb7723b092a2a9aeaae64..c56f82d0bc08429afa288bf24cd59d264af3e2ce 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -89,11 +89,16 @@ void HandleComplexGradToRealGrad(const NameVarMap& outs) { } template <> -void HandleComplexGradToRealGrad( - const NameVarMap& outs) { +void HandleComplexGradToRealGrad( + const NameVarMap& outs) { // TODO(jiabin): Support Complex here. 
} +void TestHandleComplexGradToRealGradEager( + const NameVarMap& outs) { + HandleComplexGradToRealGrad(outs); +} + PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, @@ -278,6 +283,16 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (kernel_iter == kernels.end() && + paddle::platform::is_custom_place(expected_kernel_key.place_)) { + VLOG(3) << "missing " << place.GetDeviceType() << " kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", falling back to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case @@ -312,14 +327,14 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, default_attrs); } -PreparedOp PreparedOp::Prepare(const NameVarMap& ins, - const NameVarMap& outs, +PreparedOp PreparedOp::Prepare(const NameVarMap& ins, + const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, - default_attrs); + return PrepareImpl(ins, outs, op, place, attrs, + default_attrs); } template static void PreparedOpRunImpl( @@ -451,18 +466,18 @@ void PreparedOp::Run(const NameVarMap& ins, } } -void PreparedOp::Run(const NameVarMap& ins, - const NameVarMap& outs, +void PreparedOp::Run(const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { - PreparedOpRunPtImpl( + PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); } else { - PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, - dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, + dev_ctx_, ins, outs, attrs, + default_attrs); } } diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d5dc53196dd7f1abe854785e0e5c1ccd363d1c3f..a6b80e0d4e1927a8012ff90d54ef71857d504fc6 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -63,8 +63,8 @@ void SetForwardDataTypeOfGradVar(const std::shared_ptr& var) { } template <> -void SetForwardDataTypeOfGradVar( - const std::shared_ptr& var) { +void SetForwardDataTypeOfGradVar( + const std::shared_ptr& var) { VLOG(10) << "Var in Eager does not support SetForwardDataTypeOfGradVar: " << var->name(); // TODO(jiabin): SetForwardDataType of Grad var is not supported yet in @@ -171,8 +171,8 @@ class PreparedOp { const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs); - static PreparedOp Prepare(const NameVarMap& ins, - const NameVarMap& outs, + static PreparedOp Prepare(const NameVarMap& ins, + const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs); @@ -187,8 +187,8 @@ class PreparedOp { const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs); - void Run(const NameVarMap& ins, - const NameVarMap& outs, + void Run(const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap&
attrs, const framework::AttributeMap& default_attrs); @@ -270,26 +270,26 @@ void BuildDygraphPtenKernelContext( kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - } else { - auto ins_vector = it->second; - size_t end_idx = start_idx + ins_vector.size(); - - for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - const pten::TensorBase* tensor_in = nullptr; - auto& var = ins_vector[offset]->Var(); - if (var.template IsType()) { - tensor_in = &(var.template Get()); - } else if (var.template IsType()) { - tensor_in = &(var.template Get()); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported input `%s` type when call pt kernel.", - framework::ToTypeName(var.Type()))); - } - kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + continue; + } + auto ins_vector = it->second; + size_t end_idx = start_idx + ins_vector.size(); + + for (size_t offset = 0; offset < ins_vector.size(); ++offset) { + const pten::TensorBase* tensor_in = nullptr; + auto& var = ins_vector[offset]->Var(); + if (var.template IsType()) { + tensor_in = &(var.template Get()); + } else if (var.template IsType()) { + tensor_in = &(var.template Get()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported input `%s` type when call pt kernel.", + framework::ToTypeName(var.Type()))); } - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -421,6 +421,8 @@ void BuildDygraphPtenKernelContext( kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); @@ -466,8 +468,7 @@ void PreparePtenData(const pten::Kernel& pt_kernel, for (size_t i = 0; i < input_names.size(); ++i) { auto& in_def = input_defs.at(i); - auto it = ins.find(input_names[i]); - if (it == ins.end()) { + if (ins.find(input_names[i]) == ins.end()) { continue; } auto& ins_vector = ins.at(input_names[i]); diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 56eb47a2ef1719d3aad9eb10a47a46d06d0866d5..774bb9653e2cba5c27f9037ee905e70175375339 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -12,7 +12,7 @@ else() endif(WIN32) -cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function pten_tensor pten_api pten_api_utils) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry 
variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc index d34cb924d566322a4d37555a64281688ae8a116d..57a2149b23c1bef678bc262d1bb009ed6cfeb572 100644 --- a/paddle/fluid/imperative/tests/test_eager.cc +++ b/paddle/fluid/imperative/tests/test_eager.cc @@ -31,8 +31,8 @@ namespace paddle { namespace imperative { extern std::string LayerDebugString(const std::string& op_type, - const NameVarMap& ins, - const NameVarMap& outs); + const NameVarMap& ins, + const NameVarMap& outs); extern std::shared_ptr CreateGradOpNode( const framework::OperatorBase& op, const NameTensorMap& ins, @@ -41,20 +41,21 @@ extern std::shared_ptr CreateGradOpNode( const std::map& inplace_map); TEST(test_eager, eager_debug) { - std::shared_ptr x_in(new egr::EagerTensor("x_in")); - std::shared_ptr y_in(new egr::EagerTensor("y_in")); - std::shared_ptr vout(new egr::EagerTensor("vout")); - imperative::NameVarMap ins = {{"X", {x_in}}, {"Y", {y_in}}}; - imperative::NameVarMap outs = {{"Out", {vout}}}; + std::shared_ptr x_in(new egr::EagerVariable("x_in")); + std::shared_ptr y_in(new egr::EagerVariable("y_in")); + std::shared_ptr vout(new egr::EagerVariable("vout")); + imperative::NameVarMap ins = {{"X", {x_in}}, + {"Y", {y_in}}}; + imperative::NameVarMap outs = {{"Out", {vout}}}; LayerDebugString("mul", ins, outs); } TEST(test_create_node, eager_node) { auto op = framework::OpRegistry::CreateOp("mul", {}, {}, {}, false); framework::Scope scope; auto ctx = framework::RuntimeContext({}, {}); - imperative::NameVarMap ins = {{"X", {nullptr}}, - {"Y", {nullptr}}}; - imperative::NameVarMap outs = {{"Out", {nullptr}}}; + imperative::NameVarMap ins = {{"X", {nullptr}}, + {"Y", {nullptr}}}; + imperative::NameVarMap outs = {{"Out", {nullptr}}}; CreateGradOpNode((*op.get()), ins, outs, framework::AttributeMap{}, framework::AttributeMap{}, platform::CPUPlace(), {}); } @@ -72,26 +73,26 @@ TEST(test_var_helper, eager_var_helper) { ASSERT_ANY_THROW( InitializeVariable(&var8, paddle::framework::proto::VarType::FP64)); - auto egr_tensor = std::make_shared(); - auto egr_tensor2 = std::make_shared(); + auto egr_tensor = std::make_shared(); + auto egr_tensor2 = std::make_shared(); egr_tensor->MutableVar() ->GetMutable() ->mutable_value() ->mutable_data(platform::CPUPlace()); egr_tensor2->MutableVar()->GetMutable(); VLOG(6) << "egr_tensor create with "; - ASSERT_TRUE(platform::is_cpu_place(GetPlace(egr_tensor))); - ASSERT_TRUE(GetDataType(egr_tensor) == + ASSERT_TRUE(platform::is_cpu_place(GetPlace(egr_tensor))); + ASSERT_TRUE(GetDataType(egr_tensor) == framework::proto::VarType::FP32); - GetCachedValue( + GetCachedValue( egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, platform::CPUPlace())); - SetCachedValue( + SetCachedValue( egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, platform::CPUPlace()), egr_tensor2); - ASSERT_ANY_THROW(GetPlace(egr_tensor2)); - ASSERT_ANY_THROW(SetType( + ASSERT_ANY_THROW(GetPlace(egr_tensor2)); + ASSERT_ANY_THROW(SetType( egr_tensor, paddle::framework::proto::VarType::LOD_TENSOR_ARRAY)); } } // namespace imperative diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 584f8ead3d8de40ed296da9e2f99845b9e7e5d3c..4dfc8198064e376edf55df9b4c51031344f71485 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ 
-29,6 +29,57 @@ namespace framework = paddle::framework; namespace paddle { namespace imperative { +TEST(Test__SelectedRowsMerge_Test, SelectedRowsMerge) { + pten::CPUPlace cpu; + + std::vector rows{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + int64_t table_size = 10; + int64_t embedding_width = 10; + + auto sr1 = std::make_shared(rows, table_size); + auto sr2 = std::make_shared(rows, table_size); + + // initialize a sparse table 1 + sr1->mutable_value()->Resize( + pten::framework::make_ddim({table_size, embedding_width})); + auto* data_sr1 = sr1->mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data_sr1[i * embedding_width + j] = static_cast(i); + } + } + + // initialize a sparse table 2 + sr2->mutable_value()->Resize( + pten::framework::make_ddim({table_size, embedding_width})); + auto* data_sr2 = sr2->mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data_sr2[i * embedding_width + j] = static_cast(i); + } + } + // new 2 pten::Tensor + paddle::experimental::Tensor t1(sr1); + paddle::experimental::Tensor t2(sr2); + + // call SelectedRowsMerge + auto new_buffer = + paddle::imperative::SelectedRowsMerge(t1, + t2); + auto* new_buffer_tensor = + static_cast(new_buffer->impl().get()); + auto* new_buffer_data_sr1 = + new_buffer_tensor->mutable_value()->mutable_data(cpu); + + // verify the MergeAdd result + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + EXPECT_EQ(new_buffer_data_sr1[i * embedding_width + j], + (static_cast(i) + static_cast(i))); + } + } +} + template int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) { framework::Variable var1; diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3a0bb7c52bfe2eabb9e769cfd6c8d436df4a87e3..c99dbf1cf6258dd3bb1fbdd753b37adfb2736f14 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -265,5 +265,5 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) { USE_OP(mul); USE_OP(mul_grad); -USE_OP(elementwise_add); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add); +USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index bcd4e62e57c270c5af0e6f5632fdc5f4f803fb29..224b8228097c475bac5bb1c62d126699d975ae66 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -39,6 +39,8 @@ using vb_vector = std::vector>; using var_pair = std::pair; +extern void TestSetForwardDataTypeOfGradVarsEager( + const NameVarMap& outs); template class TestRuntimeInferVarTypeContext : public RuntimeInferVarTypeContext { @@ -406,6 +408,11 @@ TEST(test_layer, test_inner_op_not_inited) { ASSERT_THROW(op.CheckAttrs(), platform::EnforceNotMet); } +TEST(test_layer, test_eager) { + imperative::NameTensorMap ins = {}; + TestSetForwardDataTypeOfGradVarsEager(ins); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index fa52aa6d0af61578e18d51e8b95c13b5d383c858..a440a1f486a0c75f299a7692b61b87d393780eb6 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -32,6 +32,9 @@ namespace framework = paddle::framework; namespace paddle { 
namespace imperative { +extern void TestHandleComplexGradToRealGradEager( + const NameVarMap& outs); + static framework::VariableNameMap CreateVarNameMap( const framework::OpInfo& op_info, const std::string& op_type, const NameVarBaseMap& varbase_map, bool is_input) { @@ -209,6 +212,11 @@ TEST(test_prepare_op, test_prepare_data_same_place) { TestPrepareDataSamePlace({}); } +TEST(test_prepare_op, test_complex_eager) { + NameVarMap outs = {}; + TestHandleComplexGradToRealGradEager(outs); +} + #ifdef PADDLE_WITH_MKLDNN TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { TestPrepareDataSamePlace({{"use_mkldnn", true}}); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index ff3331be56c3abe886496df95039c85073ed4777..ccce360269153ba2e8c6586b934f6a9bf6ace819 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -37,9 +37,10 @@ namespace paddle { namespace imperative { using vb_vector = std::vector>; - using var_pair = std::pair; +using ev_vector = std::vector>; +using ev_pair = std::pair; TEST(test_tracer, test_trace_op) { // Doing an mul imperative::Tracer tracer; @@ -546,6 +547,44 @@ TEST(test_tracer, test_execution_context) { ASSERT_EQ(dy_ctx.OutputName("Out"), framework::kEmptyVarName); } +TEST(test_tracer, eager_tracer) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in(new egr::EagerVariable("x_in")); + std::shared_ptr y_in(new egr::EagerVariable("y_in")); + std::shared_ptr vout(new egr::EagerVariable("vout")); + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + ev_pair x_pair = ev_pair("X", ev_vector(1, x_in)); + ev_pair y_pair = ev_pair("Y", ev_vector(1, y_in)); + ev_pair out_pair = ev_pair("Out", ev_vector(1, vout)); + imperative::NameTensorMap ins = {x_pair, y_pair}; + imperative::NameTensorMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, + true); + + const auto& out_tensor = vout->Var().Get(); + for (int i = 0; i < vout->Var().Get().numel(); i++) { + ASSERT_EQ(out_tensor.data()[i], 20.0); + } +} + } // namespace imperative } // namespace paddle @@ -553,4 +592,4 @@ USE_OP(mul); USE_OP(mul_grad); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 81cd39c225b533d742d9eb399c8c87863a6572e5..a600720ef78edb5175bb7d17821f5d8e229d1a93 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ 
-138,6 +139,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use MLU device since it's not compiled with MLU," "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0)); + VLOG(10) << "Created GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use CustomDevice since it's not compiled with " + "CustomDevice," + "Please recompile or reinstall Paddle with CustomDevice " + "support.")); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -156,7 +168,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, const platform::Place& place, bool trace_backward, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, - bool override_default_attr_map) { + bool use_default_attr_map) { platform::RecordEvent op_type_record_event(type); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; @@ -222,9 +234,17 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with MLU if use MLUPlace.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::DeviceManager::SetDevice(place); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with CustomDevice if use " + "CustomPlace.")); #endif } - if (!override_default_attr_map) { + if (!use_default_attr_map) { PADDLE_ENFORCE_NOT_NULL(passed_default_attrs_, paddle::platform::errors::PermissionDenied( "Detected default_attrs = nullptr.")); @@ -260,16 +280,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, } if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - if (!override_default_attr_map) { - PADDLE_ENFORCE_NOT_NULL(passed_default_attrs_, - paddle::platform::errors::PermissionDenied( - "Detected default_attrs = nullptr.")); - CreateGradOpNode(*op, new_ins, outs, attrs, *passed_default_attrs_, place, - inplace_map); - } else { - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); - } + PADDLE_ENFORCE_EQ( + passed_default_attrs_, nullptr, + paddle::platform::errors::PermissionDenied( + "We expect passed_default_attrs_ is nullptr while " + "use_default_attr_map is true, however we got not null " + "passed_default_attrs_. Please check your usage of trace_op. 
")); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); } else { VLOG(3) << "No Grad to track for Op: " << type; } @@ -281,16 +299,14 @@ template void Tracer::TraceOp( const NameVarMap& outs, framework::AttributeMap attrs, const platform::Place& place, bool trace_backward, const std::map& inplace_map, - paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map); + paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map); -template void Tracer::TraceOp( - const std::string& type, const NameVarMap& ins, - const NameVarMap& outs, framework::AttributeMap attrs, +template void Tracer::TraceOp( + const std::string& type, const NameVarMap& ins, + const NameVarMap& outs, framework::AttributeMap attrs, const platform::Place& place, bool trace_backward, const std::map& inplace_map_, - paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map); + paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map); void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, @@ -304,13 +320,12 @@ void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, paddle::framework::AttributeMap attrs, const paddle::platform::Place& place, paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map, + bool use_default_attr_map, const std::map& inplace_map) { - VLOG(6) << "Running On Eager TraceOp with override_default_attr_map: " - << override_default_attr_map; - TraceOp(type, ins, outs, std::move(attrs), place, false, - inplace_map, default_attrs, - override_default_attr_map); + VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: " + << use_default_attr_map; + TraceOp(type, ins, outs, std::move(attrs), place, false, + inplace_map, default_attrs, use_default_attr_map); } void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, @@ -318,8 +333,9 @@ void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, paddle::framework::AttributeMap attrs, const std::map& inplace_map) { VLOG(6) << "Running On Eager TraceOp(less): "; - TraceOp(type, ins, outs, std::move(attrs), expected_place_, - false, inplace_map, nullptr, true); + TraceOp(type, ins, outs, std::move(attrs), + expected_place_, false, inplace_map, nullptr, + true); } void Tracer::SetExpectedPlace(platform::Place place) { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 4e406a9482da0da456ad43046e48b97232dff885..3a9a1b630ce9cbc89f57b746e6e1e1445f6bd318 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -69,7 +69,7 @@ class Tracer { const platform::Place& place, bool trace_backward, const std::map& inplace_map = {}, paddle::framework::AttributeMap* passed_default_attrs_ = nullptr, - bool override_default_attr_map = true); + bool use_default_attr_map = true); void TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, @@ -83,7 +83,7 @@ class Tracer { const NameTensorMap& outs, paddle::framework::AttributeMap attrs, const paddle::platform::Place& place, paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map, + bool use_default_attr_map, const std::map& inplace_map = {}); bool ComputeRequiredGrad(const NameVarBaseMap& ins, diff --git a/paddle/fluid/imperative/var_helper.cc b/paddle/fluid/imperative/var_helper.cc index 
3548f2eeafd24126b50329246dd85f2f0e47878b..d97f7c1ee19b33e75b11d8f7541e638c93d152f0 100644 --- a/paddle/fluid/imperative/var_helper.cc +++ b/paddle/fluid/imperative/var_helper.cc @@ -95,8 +95,8 @@ template const paddle::platform::Place &GetPlace( const std::shared_ptr &var); template const paddle::platform::Place &GetPlace( const std::shared_ptr &var); -template const paddle::platform::Place &GetPlace( - const std::shared_ptr &var); +template const paddle::platform::Place &GetPlace( + const std::shared_ptr &var); /* GetNameFromVar */ template @@ -104,8 +104,8 @@ const std::string &GetNameFromVar(std::shared_ptr var) { return var->Name(); } template <> -const std::string &GetNameFromVar( - std::shared_ptr tensor) { +const std::string &GetNameFromVar( + std::shared_ptr tensor) { return tensor->name(); } template const std::string &GetNameFromVar( @@ -120,8 +120,8 @@ void SetType(std::shared_ptr var, var->SetType(type); } template <> -void SetType(std::shared_ptr var, - framework::proto::VarType::Type type) { +void SetType(std::shared_ptr var, + framework::proto::VarType::Type type) { switch (type) { case paddle::framework::proto::VarType::LOD_TENSOR: { var->MutableVar()->GetMutable(); @@ -149,8 +149,8 @@ framework::proto::VarType::Type GetType(std::shared_ptr var) { return var->Type(); } template <> -framework::proto::VarType::Type GetType( - std::shared_ptr var) { +framework::proto::VarType::Type GetType( + std::shared_ptr var) { if (var->Var().IsInitialized()) { return paddle::framework::ToVarType(var->Var().Type()); } else { @@ -168,8 +168,8 @@ framework::proto::VarType::Type GetDataType(std::shared_ptr var) { return var->DataType(); } template <> -framework::proto::VarType::Type GetDataType( - std::shared_ptr var) { +framework::proto::VarType::Type GetDataType( + std::shared_ptr var) { if (var->Var().IsType()) { return framework::TransToProtoVarType( var->Var().Get().value().type()); @@ -197,8 +197,8 @@ bool CheckCachedKey(std::shared_ptr var, return GetVariableWrapper(var)->hasCacheKey(key); } template <> -bool CheckCachedKey( - std::shared_ptr tensor, +bool CheckCachedKey( + std::shared_ptr tensor, const paddle::framework::OpKernelType &key) { // TODO(jiabin): Support this later // VLOG(10) << "CheckCachedKey with tensor: " << tensor->name() << "and key is @@ -219,7 +219,7 @@ std::shared_ptr GetCachedValue( } template <> std::shared_ptr GetCachedValue( - std::shared_ptr var, + std::shared_ptr var, const paddle::framework::OpKernelType &key) { // TODO(jiabin): Support this later // PADDLE_THROW(platform::errors::Fatal("In eager mode program should not @@ -243,10 +243,10 @@ void SetCachedValue(std::shared_ptr var, GetVariableWrapper(var)->setCacheValue(key, GetVariableWrapper(res)); } template <> -void SetCachedValue( - std::shared_ptr tensor, +void SetCachedValue( + std::shared_ptr tensor, const paddle::framework::OpKernelType &key, - std::shared_ptr res) { + std::shared_ptr res) { // PADDLE_THROW(platform::errors::Fatal("In eager mode program should not // reach this, support cache and remove this error check later, or this // should not be supported.")); diff --git a/paddle/fluid/imperative/var_helper.h b/paddle/fluid/imperative/var_helper.h index ff228e0ab84e2aec8d3d399bc1e5ba9cb14b42c2..cbcc1a9f99daaa16d0dfc5c79f610434dd4e33a5 100644 --- a/paddle/fluid/imperative/var_helper.h +++ b/paddle/fluid/imperative/var_helper.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/variable.h" namespace egr { -class EagerTensor; +class EagerVariable; } // namespace egr namespace pten { class 
DenseTensor; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 904baebcb0be70b0d557a9431d1e8b969f0d74a2..e4fc52b6fa74427b1f24b194dffea6f39e2b4692 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -379,8 +379,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp( trt_engine->SetUseInspector(Get("use_inspector")); trt_engine->SetWithErnie( - graph->Has(framework::ir::kEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass)); + (graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)) || + (graph->Has(framework::ir::kPrelnEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass))); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index caac973d8b89a3ff1c605d81cb07bbdcb7a63304..7e4da57e9e7dfce3051d42183a8e89ebd04bd8f0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1470,6 +1470,8 @@ USE_TRT_CONVERTER(conv3d_transpose); USE_TRT_CONVERTER(mish); USE_TRT_CONVERTER(deformable_conv); USE_TRT_CONVERTER(pool3d) +USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) +USE_TRT_CONVERTER(preln_skip_layernorm) #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 66b27b2903a70193f347d635ce7f863f8aa29b52..313e1f2faea553809cb6fce66ca9a751bace8d75 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -82,22 +82,24 @@ const std::vector kTRTSubgraphPasses({ "quant_conv2d_dequant_fuse_pass", // "delete_quant_dequant_op_pass", // "delete_quant_dequant_filter_op_pass", // - // "fc_fuse_pass", // - "simplify_with_basic_ops_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "multihead_matmul_fuse_pass_v3", // - "skip_layernorm_fuse_pass", // - "conv_bn_fuse_pass", // - "unsqueeze2_eltwise_fuse_pass", // - "trt_squeeze2_matmul_fuse_pass", // - "trt_reshape2_matmul_fuse_pass", // - "trt_flatten2_matmul_fuse_pass", // - "trt_map_matmul_v2_to_mul_pass", // - "trt_map_matmul_v2_to_matmul_pass", // - "trt_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + // "fc_fuse_pass", // + "simplify_with_basic_ops_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "preln_embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "multihead_matmul_fuse_pass_v3", // + "skip_layernorm_fuse_pass", // + "preln_skip_layernorm_fuse_pass", // + "conv_bn_fuse_pass", // + "unsqueeze2_eltwise_fuse_pass", // + "trt_squeeze2_matmul_fuse_pass", // + "trt_reshape2_matmul_fuse_pass", // + "trt_flatten2_matmul_fuse_pass", // + "trt_map_matmul_v2_to_mul_pass", // + "trt_map_matmul_v2_to_matmul_pass", // + "trt_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // "add_support_int8_pass", "tensorrt_subgraph_pass", // "conv_bn_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 
017caca6adc814af32d6045ce0510099c5935ed8..e91faedb06872a5abe38c1de77b54477e0da8ef4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -21,6 +21,8 @@ nv_library(tensorrt_converter nearest_interp_v2_op.cc pool3d_op.cc deformable_conv_op.cc + preln_emb_eltwise_layernorm.cc + preln_skip_layernorm.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 0436499cd40756150d5b33c6d685d74ffbe5b87d..3e326414825d09d8611d5c845975ef31cf5c83ce 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -43,30 +43,161 @@ class GeluOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert fluid gelu op to tensorrt gelu layer"; - framework::OpDesc op_desc(op, nullptr); // Declare inputs - int input_num = op_desc.Input("X").size(); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::GeluPluginDynamic* plugin = - new plugin::GeluPluginDynamic(with_fp16); - layer = engine_->AddDynamicPlugin(&input, input_num, plugin); + if (op_desc.HasAttr("approximate") && + BOOST_GET_CONST(bool, op_desc.GetAttr("approximate"))) { +#if IS_TRT_VERSION_GE(7000) + nvinfer1::Dims input_shape; + input_shape.nbDims = input->getDimensions().nbDims; + for (int i = 0; i < input_shape.nbDims; ++i) { + input_shape.d[i] = 1; + } + std::string out_name = op_desc.Output("Out").front(); + auto create_weights = [&](float data, std::string type) -> float* { + std::unique_ptr tmp_tensor(new framework::Tensor()); + tmp_tensor->Resize({1}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + tmp_data[0] = data; + engine_->SetWeights(out_name + "_gelu_op_" + type, + std::move(tmp_tensor)); + return tmp_data; + }; + float* constant_pow = create_weights(3.0f, "constant_pow"); + float* constant_multiply = create_weights(0.044715f, "constant_multiply"); + float* constant_sqrt = + create_weights(0.79788456080286535587989211986876f, "constant_sqrt"); + float* constant_one = create_weights(1.0f, "constant_one"); + float* constant_half = create_weights(0.5f, "constant_half"); + auto constant_layer_pow = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_pow), 1}); + auto constant_layer_multiply = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_multiply), 1}); + auto constant_layer_sqrt = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_sqrt), 1}); + auto constant_layer_one = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_one), 1}); + auto constant_layer_half = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_half), 1}); + auto layer_pow = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *input, 
*constant_layer_pow->getOutput(0), + nvinfer1::ElementWiseOperation::kPOW); + auto layer_mul = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_pow->getOutput(0), + *constant_layer_multiply->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + auto layer_add = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_mul->getOutput(0), + *input, nvinfer1::ElementWiseOperation::kSUM); + auto layer_sqrt = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_add->getOutput(0), + *constant_layer_sqrt->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + auto layer_tanh = + TRT_ENGINE_ADD_LAYER(engine_, Activation, *layer_sqrt->getOutput(0), + nvinfer1::ActivationType::kTANH); + auto layer_one = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_tanh->getOutput(0), + *constant_layer_one->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + auto layer_CDF = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_one->getOutput(0), + *constant_layer_half->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + auto y = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_CDF->getOutput(0), + *input, nvinfer1::ElementWiseOperation::kPROD); + layer = y; #else PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + "You are running GeLU Op with approximate True, need to confirm that " + "your TRT version is no less than 7.0")); #endif } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::GeluPlugin* plugin = new plugin::GeluPlugin(with_fp16); - layer = engine_->AddPlugin(&input, input_num, plugin); +#if IS_TRT_VERSION_GE(7000) + nvinfer1::Dims input_shape; + input_shape.nbDims = input->getDimensions().nbDims; + for (int i = 0; i < input_shape.nbDims; ++i) { + input_shape.d[i] = 1; + } + std::string out_name = op_desc.Output("Out").front(); + auto create_weights = [&](float data, std::string type) -> float* { + std::unique_ptr tmp_tensor(new framework::Tensor()); + tmp_tensor->Resize({1}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + tmp_data[0] = data; + engine_->SetWeights(out_name + "_gelu_op_" + type, + std::move(tmp_tensor)); + return tmp_data; + }; + float* constant_one = create_weights(1.0f, "constant_one"); + float* constant_half = create_weights(0.5f, "constant_half"); + float* constant_rsqrt2 = + create_weights(0.70710678118f, "constant_rsqrt2"); + auto constant_layer_one = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_one), 1}); + auto constant_layer_half = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_half), 1}); + auto constant_layer_rsqrt2 = TRT_ENGINE_ADD_LAYER( + engine_, Constant, input_shape, + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, + static_cast(constant_rsqrt2), 1}); + auto layer_mul = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *input, *constant_layer_rsqrt2->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + auto layer_erf = + TRT_ENGINE_ADD_LAYER(engine_, Unary, *layer_mul->getOutput(0), + nvinfer1::UnaryOperation::kERF); + auto layer_add = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_erf->getOutput(0), + *constant_layer_one->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + auto layer_CDF = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_add->getOutput(0), + *constant_layer_half->getOutput(0), + 
nvinfer1::ElementWiseOperation::kPROD); + auto y = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *layer_CDF->getOutput(0), + *input, nvinfer1::ElementWiseOperation::kPROD); + layer = y; +#else // if IS_TRT_VERSION_GE(7000) + int input_num = op_desc.Input("X").size(); + if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::GeluPluginDynamic* plugin = + new plugin::GeluPluginDynamic(with_fp16); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); +#else + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); +#endif + } else { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::GeluPlugin* plugin = new plugin::GeluPlugin(with_fp16); + layer = engine_->AddPlugin(&input, input_num, plugin); + } +#endif // if IS_TRT_VERSION_GE(7000) } auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "gelu", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 26d87e4832f5f194ca88be41596d34c3226b0390..fe04d552e40263a396059e3da59de4d51def67e0 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -106,6 +106,9 @@ class Pool2dOpConverter : public OpConverter { reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; } + if (global_pooling || adaptive) { + std::fill(paddings.begin(), paddings.end(), 0); + } if (padding_algorithm == "VALID") { std::fill(paddings.begin(), paddings.end(), 0); @@ -136,6 +139,46 @@ class Pool2dOpConverter : public OpConverter { #endif } + std::vector real_paddings = paddings; + for (int i = 0; i < 2; ++i) { + int copy_pad = *(paddings.begin() + i); + real_paddings.insert(real_paddings.begin() + 2 * i + 1, copy_pad); + } + // SAME + if (padding_algorithm == "SAME") { + // expand + for (int i = 0; i < 2; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + // compute + for (int i = 0; i < 2; ++i) { + int out_size = (input_shape.d[2 + i] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i] - input_shape.d[2 + i], 0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + paddings[i * 2] = pad_0; + paddings[i * 2 + 1] = pad_1; + } + real_paddings = paddings; + // slice + for (int i = 0; i < 2; ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + // VALID + if (padding_algorithm == "VALID") { + std::fill(real_paddings.begin(), real_paddings.end(), 0); + } + + if (global_pooling == true && !engine_->with_dynamic_shape()) { + nv_ksize.d[0] = input_shape.d[input_dims - 2]; + nv_ksize.d[1] = input_shape.d[input_dims - 1]; + ksize[0] = input_shape.d[input_dims - 2]; + ksize[1] = input_shape.d[input_dims - 1]; + } + if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { // input_shape.d < 0 means we can't get shape info here. 
@@ -173,15 +216,15 @@ class Pool2dOpConverter : public OpConverter { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP); } layer = pool_layer; - } else if (global_pooling) { + } else if (global_pooling && !adaptive) { auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, reduce_operation, 12, true); layer = reduce_layer; } else { #if IS_TRT_VERSION_GE(6000) - plugin::PoolPluginDynamic *plugin = - new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize, - strides, paddings, global_pooling); + plugin::PoolPluginDynamic *plugin = new plugin::PoolPluginDynamic( + ceil_mode, pool_type, adaptive, exclusive, ksize, strides, paddings, + global_pooling); layer = engine_->AddDynamicPlugin(&input1, 1, plugin); #endif } @@ -195,21 +238,13 @@ class Pool2dOpConverter : public OpConverter { return; } - if (global_pooling == true) { - nv_ksize.d[0] = input_shape.d[input_dims - 2]; - nv_ksize.d[1] = input_shape.d[input_dims - 1]; + if (global_pooling == true && adaptive == false) { auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); auto output_name = op_desc.Output("Out")[0]; - pool_layer->setStride(nv_strides); - pool_layer->setPadding(nv_paddings); - if (padding_algorithm == "SAME") { - pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); - } - pool_layer->setAverageCountExcludesPadding(exclusive); pool_layer->setName(("pool2d (Output: " + output_name + ")").c_str()); pool_layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, pool_layer->getOutput(0)); @@ -222,58 +257,61 @@ class Pool2dOpConverter : public OpConverter { if (!adaptive) { if (ceil_mode) { - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); - // If ceil mode is true, we will pad the appropriate size to the input. - DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, - input_dims); - auto *pad_layer = - TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, pre_pad, post_pad); - + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::PoolPlugin *plugin = new plugin::PoolPlugin( + ceil_mode, plugin_pool_type, adaptive, exclusive, ksize, strides, + paddings, input_shape_v, real_paddings); + auto *pool_layer = engine_->AddPlugin(&input1, 1, plugin); PADDLE_ENFORCE_NOT_NULL( - pad_layer, platform::errors::Fatal( - "Pad layer in poolOp converter could not be " - "created. The pointer to pad layer is `NULL`.")); - input1 = pad_layer->getOutput(0); - } + pool_layer, + platform::errors::Fatal( + "trt pool plugin layer in converter could not be created.")); + layer = pool_layer; + } else { #if IS_TRT_VERSION_GE(8000) - // Exclude padding pixels from the average mean is not supported well by - // TRT - // so enable padding for trt8.0 above. - if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && - (padding_algorithm != "SAME") && !ceil_mode) { - auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, - g_pre_pad, g_post_pad); - PADDLE_ENFORCE_NOT_NULL( - pad_layer, platform::errors::Fatal( - "Pad layer in poolOp converter could not be " - "created. The pointer to pad layer is `NULL`.")); - input1 = pad_layer->getOutput(0); - } + // Exclude padding pixels from the average mean is not supported well by + // TRT + // so enable padding for trt8.0 above. 
+ if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && + (padding_algorithm != "SAME") && !ceil_mode) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + g_pre_pad, g_post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } #endif - auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, - nv_pool_type, nv_ksize); - PADDLE_ENFORCE_NOT_NULL( - pool_layer, platform::errors::Fatal( - "trt pool layer in converter could not be created.")); - pool_layer->setStride(nv_strides); - pool_layer->setPadding(nv_paddings); - if (padding_algorithm == "SAME") { - pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool layer in converter could not be created.")); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = pool_layer; } - pool_layer->setAverageCountExcludesPadding(exclusive); - layer = pool_layer; + } else { // Average pooling needs to exclude the padding pixels from the average // mean. - // It is not supported well by TRT, we use a plugin here. + // It is not supported well by TRT, we use a plugin here std::vector input_shape_v; for (int i = 0; i < input_dims; i++) { input_shape_v.push_back(input_shape.d[i]); } - plugin::PoolPlugin *plugin = - new plugin::PoolPlugin(ceil_mode, plugin_pool_type, adaptive, ksize, - strides, paddings, input_shape_v); + plugin::PoolPlugin *plugin = new plugin::PoolPlugin( + ceil_mode, plugin_pool_type, adaptive, exclusive, ksize, strides, + paddings, input_shape_v, real_paddings); auto *pool_layer = engine_->AddPlugin(&input1, 1, plugin); PADDLE_ENFORCE_NOT_NULL( pool_layer, diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..50f90de85fd0494110b86dde743428a6b1844b57 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { +#if IS_TRT_VERSION_GE(7000) + VLOG(4) << "convert fluid PrelnEmbEltwiseLayerNorm op to tensorrt layer"; + + if (!(engine_->use_oss() && engine_->with_interleaved())) { + PADDLE_THROW(platform::errors::Fatal( + "PrelnErnie: If you want to use oss, must be with interleaved")); + } + framework::OpDesc op_desc(op, nullptr); + bool enable_int8 = op_desc.HasAttr("enable_int8"); + if (!enable_int8) { + PADDLE_THROW( + platform::errors::Fatal("use with_interleaved must be int8.")); + } + auto word_id_name = op_desc.Input("WordId").front(); + auto pos_id_name = op_desc.Input("PosId").front(); + engine_->Set("ernie_pos_name", new std::string(pos_id_name)); + + auto sent_id_name = op_desc.Input("SentId").front(); + auto word_emb_name = op_desc.Input("WordEmbedding").front(); + auto pos_emb_name = op_desc.Input("PosEmbedding").front(); + auto sent_emb_name = op_desc.Input("SentEmbedding").front(); + + std::vector id_names; + std::vector emb_names; + + id_names = + std::vector{word_id_name, pos_id_name, sent_id_name}; + emb_names = + std::vector{word_emb_name, pos_emb_name, sent_emb_name}; + + int input_num = id_names.size(); + + // Declare inputs + std::vector input_ids; + for (int i = 0; i < input_num; i++) { + input_ids.push_back(engine_->GetITensor(id_names[i])); + } + + // input_embs[0]: word_embedding + // input_embs[1]: pos_embedding + // input_embs[2]: sent_embedding + std::vector input_embs; + std::vector emb_sizes; + + // get the presistable var's data + auto get_persistable_data = [&](const std::string& var_name, + framework::DDim* dims) -> float* { + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + (*dims) = temp_tensor->dims(); + + auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor, false); + return temp_data; + }; + + for (int i = 0; i < input_num; i++) { + framework::DDim emb_dims; + float* emb_data = get_persistable_data(emb_names[i], &emb_dims); + int64_t emb_size = framework::product(emb_dims); + input_embs.push_back(emb_data); + emb_sizes.push_back(emb_size); + PADDLE_ENFORCE_EQ( + emb_dims.size(), 2, + platform::errors::InvalidArgument( + "The fused PrelnEmbEltwiseLayerNorm's emb should be 2 dims.")); + } + + framework::DDim bias_dims, scale_dims; + + auto* bias = + get_persistable_data(op_desc.Input("Bias").front(), &bias_dims); + auto* scale = + get_persistable_data(op_desc.Input("Scale").front(), &scale_dims); + int64_t bias_size = framework::product(bias_dims); + int64_t scale_size = framework::product(scale_dims); + int output_int8 = 1; + + PADDLE_ENFORCE_EQ( + input_num, 3, + platform::errors::InvalidArgument( + "When using oss and var-len, embedding_eltwise_layernorm op" + "should have 3 inputs only, but got %d.", + input_num)); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, static_cast(scale_size)}, + 
{"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_int8", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back( + engine_->GetITensor(word_id_name)); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back( + engine_->GetITensor(sent_id_name)); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back( + engine_->GetITensor(pos_id_name)); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + shuffle_layer->setName( + ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " + + op_desc.Output("Out")[0] + ")") + .c_str()); + engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "3"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + plugin_layer->setName(("CustomPrelnEmbLayerNormPluginDynamic_V3(Output: " + + op_desc.Output("Out")[0] + ")") + .c_str()); + free(plugin_ptr); + float out_0_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_0_threshold")); + float out_1_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_1_threshold")); + engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_0_scale); + engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_1_scale); + + auto* shuffler_embed_out0 = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(plugin_layer->getOutput(0))); + nvinfer1::Permutation transpose_0{2, 1, 0, 3}; + shuffler_embed_out0->setSecondTranspose(transpose_0); + shuffler_embed_out0->getOutput(0)->setName( + op_desc.Output("Out_0")[0].c_str()); + engine_->SetITensor(op_desc.Output("Out_0")[0], + shuffler_embed_out0->getOutput(0)); + shuffler_embed_out0->setName( + ("shuffler_after_CustomPrelnEmbLayerNormPluginDynamic_V3(Output_0: " + + op_desc.Output("Out_0")[0] + ")") + .c_str()); + + auto* shuffler_embed_out1 = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(plugin_layer->getOutput(1))); + nvinfer1::Permutation transpose_1{2, 1, 0, 3}; + shuffler_embed_out1->setSecondTranspose(transpose_1); + shuffler_embed_out1->getOutput(0)->setName( + op_desc.Output("Out_1")[0].c_str()); + + engine_->SetITensor(op_desc.Output("Out_1")[0], + shuffler_embed_out1->getOutput(0)); + shuffler_embed_out1->setName( + ("shuffler_after_CustomPrelnEmbLayerNormPluginDynamic_V3(Output_1: " + + op_desc.Output("Out_1")[0] + ")") + .c_str()); + +#else + PADDLE_THROW(platform::errors::Fatal( + 
"PreInErnie want to use oss, must be with interleaved, " + "your TRT version is no less than 7.0")); +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fused_preln_embedding_eltwise_layernorm, + PrelnEmbEltwiseLayerNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..aa0d6fbe81376ed61992dbc6c15c69145aa98a4d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PrelnSkipLayerNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { +#if IS_TRT_VERSION_GE(7000) + VLOG(4) << "convert fused preln_skip_layernorm op to tensorrt layer"; + if (!(engine_->use_oss() && engine_->with_interleaved())) { + PADDLE_THROW(platform::errors::Fatal( + "PrelnErnie: If you want to use oss, must be with interleaved")); + } + framework::OpDesc op_desc(op, nullptr); + bool enable_int8 = op_desc.HasAttr("enable_int8"); + if (!enable_int8) { + PADDLE_THROW( + platform::errors::Fatal("use with_interleaved must be int8.")); + } + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + std::vector inputs; + inputs.push_back(input1); + inputs.push_back(input2); + + auto get_persistable_data = [&](const std::string& arg_name, + framework::DDim* dims) -> float* { + std::string var_name = op_desc.Input(arg_name).front(); + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + (*dims) = temp_tensor->dims(); + + auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor, false); + return temp_data; + }; + + framework::DDim bias_dims, scale_dims; + auto* bias = get_persistable_data("Bias", &bias_dims); + auto* scale = get_persistable_data("Scale", &scale_dims); + int bias_size = framework::product(bias_dims); + int scale_size = framework::product(scale_dims); + + nvinfer1::ILayer* layer = nullptr; + + VLOG(4) << "fused preln_skip_layernorm op: use_oss and with_interleaved"; + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomSkipLayerNormPluginDynamic", "4"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomPrelnSkipLayerNormPluginDynamic")); + const std::vector fields{ + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + { "gamma", + scale, + nvinfer1::PluginFieldType::kFLOAT32, + scale_size }}; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) 
+ + fields.size() * sizeof(nvinfer1::PluginField))); + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomPrelnSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_name}, + test_mode); +#else + PADDLE_THROW(platform::errors::Fatal( + "PrelnErnie: If you want to use oss, must be with interleaved, " + "and your TRT version must be no less than 7.0")); +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(preln_skip_layernorm, PrelnSkipLayerNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc index 17adf957f64a76a010da6160479be2125d9deac9..d14317712b579b8f04889c3a18e4231d96513225 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -103,5 +103,5 @@ TEST(elementwise_op, plugin) { } // namespace inference } // namespace paddle -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(elementwise_mul); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 767672007dfef67ecc4424fa6c962832599b0182..799c6c55bb121778cfe3b1a39f2dc1af315236dd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -30,24 +30,6 @@ namespace tensorrt { // Just tell by the op_types. struct SimpleOpTypeSetTeller : public Teller { SimpleOpTypeSetTeller() { -#if IS_TRT_VERSION_GE(5130) - teller_set.insert("relu6"); - teller_set.insert("hard_sigmoid"); - teller_set.insert("clip"); - int8_teller_set.insert("relu6"); - int8_teller_set.insert("hard_sigmoid"); - int8_teller_set.insert("clip"); -#endif -#if IS_TRT_VERSION_GE(6000) - teller_set.insert("fused_embedding_eltwise_layernorm"); - teller_set.insert("multihead_matmul"); - teller_set.insert("skip_layernorm"); - teller_set.insert("slice"); - int8_teller_set.insert("fused_embedding_eltwise_layernorm"); - int8_teller_set.insert("multihead_matmul"); - int8_teller_set.insert("skip_layernorm"); - int8_teller_set.insert("slice"); -#endif // TODO(baoachun) The group_norm trt plugin will check input's dim // not -1 failed when dynamic shape mode. // #if IS_TRT_VERSION_GE(7130) @@ -76,104 +58,124 @@ struct SimpleOpTypeSetTeller : public Teller { private: // use this set for no calib int8.
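// [Editor's note, not part of the original patch] With the version-gated insert() calls above
// removed, relu6 / hard_sigmoid / clip / fused_embedding_eltwise_layernorm / multihead_matmul /
// skip_layernorm / slice (plus the new fused_preln_embedding_eltwise_layernorm and
// preln_skip_layernorm) are now listed unconditionally in both sets below; any remaining
// TensorRT-version restrictions presumably have to be enforced per op inside OpTeller::Tell or
// in the individual converters.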
- std::unordered_set int8_teller_set{"mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_mul", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "nearest_interp_v2", - "pool3d", - "deformable_conv"}; - std::unordered_set teller_set{"mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_mul", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "nearest_interp_v2", - "pool3d", - "deformable_conv"}; + std::unordered_set int8_teller_set{ + "mul", + "matmul", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_skip_layernorm"}; + std::unordered_set teller_set{ + "mul", + "matmul", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_skip_layernorm"}; }; bool 
OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -1007,6 +1009,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "fused_preln_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) + << "fused_preln_embedding_eltwise_layernorm should run in dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused PrelnEmbEltwiseLayerNormOp " + "should be the same."; + return false; + } + if (!desc.HasAttr("enable_int8")) { + VLOG(3) << "PrelnEmbEltwiseLayerNormOp must use int8 mode."; + return false; + } + } + if (op_type == "gelu") { if (desc.Input("X").size() != 1) { VLOG(3) << "gelu op has only 1 input, but got " @@ -1019,9 +1039,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } +#if IS_TRT_VERSION_LT(7000) if (desc.HasAttr("approximate")) { + VLOG(3) << "approximate gelu op needs TensorRT 7.0 or later"; if (BOOST_GET_CONST(bool, desc.GetAttr("approximate"))) return false; } +#endif auto* block = desc.Block(); if (block == nullptr) { @@ -1030,6 +1053,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, "the pass."; return false; } + auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -1312,6 +1336,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "preln_skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the preln_skip_layernorm does not support static shape yet"; + return false; + } + if (!desc.HasAttr("enable_int8")) { + VLOG(3) << "PrelnSkipLayerNormOp must use int8 mode."; + return false; + } + } + if (op_type == "multihead_matmul") { if (!with_dynamic_shape) { VLOG(3) << "the multihead_matmul does not support static shape yet"; diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 21c8812f3789e37a68ba75be68c296a8bc214511..6d711c26adc6ff8e49375d15f32322303f3ae6ef 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -35,6 +35,36 @@ nvinfer1::Dims PoolPlugin::getOutputDimensions(int index, return output_dims; } +size_t PoolPlugin::getSerializationSize() const TRT_NOEXCEPT { + return getBaseSerializationSize() + SerializedSize(ceil_mode_) + + SerializedSize(pool_type_) + SerializedSize(adaptive_) + + SerializedSize(exclusive_) + SerializedSize(ksize_) + + SerializedSize(strides_) + SerializedSize(paddings_) + + SerializedSize(real_paddings_) + SerializedSize(input_shape_) + + SerializedSize(output_shape_); +} + +// TRT will call this func when we need to serialize the configuration of +// tensorrt.
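// [Editor's sketch, not part of the original patch] The three serialization members have to stay
// in lock-step (assuming SerializedSize / SerializeValue / DeserializeValue advance through the
// buffer in the usual way): getSerializationSize() above must count every field, serialize()
// below must write them, and the deserializing constructor PoolPlugin(serialData, serialLength)
// further down in this patch must read them back in exactly the same order:
//   ceil_mode_, pool_type_, adaptive_, exclusive_, ksize_, strides_, paddings_,
//   real_paddings_, input_shape_, output_shape_.
// Missing a newly added field such as exclusive_ or real_paddings_ in any one of the three
// places would corrupt engine deserialization.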
+void PoolPlugin::serialize(void *buffer) const TRT_NOEXCEPT { + serializeBase(buffer); + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, pool_type_); + SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, exclusive_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, real_paddings_); + SerializeValue(&buffer, input_shape_); + SerializeValue(&buffer, output_shape_); +} + +PoolPlugin *PoolPlugin::clone() const TRT_NOEXCEPT { + return new PoolPlugin(ceil_mode_, pool_type_, adaptive_, exclusive_, ksize_, + strides_, paddings_, input_shape_, real_paddings_); +} + int PoolPlugin::enqueue(int batchSize, const void *const *inputs, #if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, @@ -59,14 +89,15 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, paddle::operators::math::MaxPool, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, - paddings_, true, adaptive_, odatas[0], stream, pool_process); + paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { paddle::operators::math::AvgPool pool_process; paddle::operators::math::Pool2dDirectCUDAFunctor< paddle::operators::math::AvgPool, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, - paddings_, true, adaptive_, odatas[0], stream, pool_process); + paddings_, exclusive_, adaptive_, odatas[0], stream, + pool_process); } return cudaGetLastError() != cudaSuccess; @@ -82,6 +113,7 @@ PoolPluginDynamic::PoolPluginDynamic(void const *serialData, DeserializeValue(&serialData, &serialLength, &pool_type); pool_type_ = std::string(pool_type); DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &exclusive_); DeserializeValue(&serialData, &serialLength, &ksize_); DeserializeValue(&serialData, &serialLength, &strides_); DeserializeValue(&serialData, &serialLength, &paddings_); @@ -90,21 +122,27 @@ PoolPluginDynamic::PoolPluginDynamic(void const *serialData, size_t PoolPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { return SerializedSize(ceil_mode_) + SerializedSize(pool_type_.c_str()) + - SerializedSize(adaptive_) + SerializedSize(ksize_) + - SerializedSize(strides_) + SerializedSize(paddings_) + - SerializedSize(is_global_); + SerializedSize(adaptive_) + SerializedSize(exclusive_) + + SerializedSize(ksize_) + SerializedSize(strides_) + + SerializedSize(paddings_) + SerializedSize(is_global_); } void PoolPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, pool_type_.c_str()); SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, exclusive_); SerializeValue(&buffer, ksize_); SerializeValue(&buffer, strides_); SerializeValue(&buffer, paddings_); SerializeValue(&buffer, is_global_); } +nvinfer1::IPluginV2DynamicExt *PoolPluginDynamic::clone() const TRT_NOEXCEPT { + return new PoolPluginDynamic(ceil_mode_, pool_type_, adaptive_, exclusive_, + ksize_, strides_, paddings_, is_global_); +} + nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { @@ -117,11 +155,14 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( platform::errors::InvalidArgument("The channel dimension should be " "static, but we found it's dynamic.")); nvinfer1::DimsExprs 
output(inputs[0]); - if (is_global_) { + if (is_global_ && !adaptive_) { output.d[2] = expr_builder.constant(1); output.d[3] = expr_builder.constant(1); return output; } + if (is_global_ && adaptive_) { + return inputs[0]; + } if (adaptive_) { output.d[2] = expr_builder.constant(ksize_[0]); output.d[3] = expr_builder.constant(ksize_[1]); @@ -245,6 +286,10 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, output_shape[2] = data_dim[0]; output_shape[3] = data_dim[1]; } + if (adaptive_) { + output_shape[2] = h; + output_shape[3] = w; + } if (pool_type_ == "max") { paddle::operators::math::MaxPool pool_process; @@ -252,14 +297,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, paddle::operators::math::MaxPool, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, - true, adaptive_, output, stream, pool_process); + true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { paddle::operators::math::AvgPool pool_process; paddle::operators::math::Pool2dDirectCUDAFunctor< paddle::operators::math::AvgPool, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, - true, adaptive_, output, stream, pool_process); + exclusive_, adaptive_, output, stream, pool_process); } return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h index 6ced066a35952f5046ca9f3dd5fb83d860086001..d1bf2cd02e84f3cff3f61702160fcfa7e53f023f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h @@ -29,26 +29,32 @@ static std::vector CalcOutputSize(const std::vector& input_shape, const bool& adaptive, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings) { + const std::vector& real_paddings) { std::vector output_shape = input_shape; if (adaptive) { output_shape[0] = ksize[0]; output_shape[1] = ksize[1]; } else { - int output_h, output_w; - if (!ceil_mode) { - output_h = (input_shape[0] - ksize[0] + 2 * paddings[0]) / strides[0] + 1; - output_w = (input_shape[1] - ksize[1] + 2 * paddings[1]) / strides[1] + 1; - } else { - output_h = - (input_shape[0] - ksize[0] + 2 * paddings[0] + strides[0] - 1) / - strides[0] + - 1; - output_w = - (input_shape[1] - ksize[1] + 2 * paddings[1] + strides[1] - 1) / - strides[1] + - 1; + int output_h = 0, output_w = 0; + if (ceil_mode) { + output_h = (input_shape[0] - ksize[0] + real_paddings[0] + + real_paddings[1] + strides[0] - 1) / + strides[0] + + 1; + output_w = (input_shape[1] - ksize[1] + real_paddings[2] + + real_paddings[3] + strides[1] - 1) / + strides[1] + + 1; } + // TRT will use native layer when ceil_model=false + /* + else{ + output_h = (input_shape[0] - ksize[0] + real_paddings[0] + + real_paddings[1]) / strides[0] + 1; + output_w = (input_shape[1] - ksize[1] + real_paddings[2] + + real_paddings[3]) / strides[1] + 1; + } + */ output_shape[0] = output_h; output_shape[1] = output_w; } @@ -57,47 +63,32 @@ static std::vector CalcOutputSize(const std::vector& input_shape, class PoolPlugin : public PluginTensorRT { public: - size_t getSerializationSize() const TRT_NOEXCEPT override { - return getBaseSerializationSize() + SerializedSize(ceil_mode_) + - SerializedSize(pool_type_) + SerializedSize(adaptive_) + - SerializedSize(ksize_) + SerializedSize(strides_) + - SerializedSize(paddings_) + 
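// [Editor's sketch, not part of the original patch] Worked example for the ceil_mode branch of
// CalcOutputSize above: with input_h = 7, ksize = 3, real paddings 1 + 1 and stride = 2,
// output_h = (7 - 3 + 1 + 1 + 2 - 1) / 2 + 1 = 7 / 2 + 1 = 4 with integer division, which matches
// ceil((7 - 3 + 2) / 2) + 1. When ceil_mode is false the plugin path is not taken and TensorRT's
// native pooling layer computes the output size, which is why the else branch is only kept as a
// comment.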
SerializedSize(input_shape_) + - SerializedSize(output_shape_); - } + size_t getSerializationSize() const TRT_NOEXCEPT override; - // TRT will call this func when we need to serialize the configuration of - // tensorrt. - void serialize(void* buffer) const TRT_NOEXCEPT override { - serializeBase(buffer); - SerializeValue(&buffer, ceil_mode_); - SerializeValue(&buffer, pool_type_); - SerializeValue(&buffer, adaptive_); - SerializeValue(&buffer, ksize_); - SerializeValue(&buffer, strides_); - SerializeValue(&buffer, paddings_); - SerializeValue(&buffer, input_shape_); - SerializeValue(&buffer, output_shape_); - } + void serialize(void* buffer) const TRT_NOEXCEPT override; enum class PoolType { max = 0, avg, }; PoolPlugin() {} - PoolPlugin(bool ceil_mode, PoolType pool_type, bool adaptive, + PoolPlugin(bool ceil_mode, PoolType pool_type, bool adaptive, bool exclusive, std::vector ksize, std::vector strides, - std::vector paddings, std::vector input_shape) + std::vector paddings, std::vector input_shape, + std::vector real_paddings) : ceil_mode_(ceil_mode), pool_type_(pool_type), adaptive_(adaptive), + exclusive_(exclusive), ksize_(ksize), strides_(strides), paddings_(paddings), + real_paddings_(real_paddings), input_shape_(input_shape) { output_shape_ = input_shape_; std::vector output_shape = CalcOutputSize({input_shape_[1], input_shape_[2]}, ceil_mode_, - adaptive_, ksize_, strides_, paddings_); + adaptive_, ksize_, strides_, real_paddings_); output_shape_[1] = output_shape[0]; output_shape_[2] = output_shape[1]; } @@ -109,17 +100,16 @@ class PoolPlugin : public PluginTensorRT { DeserializeValue(&serialData, &serialLength, &ceil_mode_); DeserializeValue(&serialData, &serialLength, &pool_type_); DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &exclusive_); DeserializeValue(&serialData, &serialLength, &ksize_); DeserializeValue(&serialData, &serialLength, &strides_); DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &real_paddings_); DeserializeValue(&serialData, &serialLength, &input_shape_); DeserializeValue(&serialData, &serialLength, &output_shape_); } - PoolPlugin* clone() const TRT_NOEXCEPT override { - return new PoolPlugin(ceil_mode_, pool_type_, adaptive_, ksize_, strides_, - paddings_, input_shape_); - } + PoolPlugin* clone() const TRT_NOEXCEPT override; const char* getPluginType() const TRT_NOEXCEPT override { return "pool_plugin"; @@ -139,9 +129,11 @@ class PoolPlugin : public PluginTensorRT { bool ceil_mode_; PoolType pool_type_; bool adaptive_; + bool exclusive_; std::vector ksize_; std::vector strides_; std::vector paddings_; + std::vector real_paddings_; std::vector input_shape_; std::vector output_shape_; }; @@ -167,12 +159,14 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { public: PoolPluginDynamic() {} PoolPluginDynamic(const bool& ceil_mode, const std::string& pool_type, - const bool& adaptive, const std::vector& ksize, + const bool& adaptive, bool exclusive, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, const bool& is_global) : ceil_mode_(ceil_mode), pool_type_(pool_type), adaptive_(adaptive), + exclusive_(exclusive), ksize_(ksize), strides_(strides), paddings_(paddings), @@ -180,10 +174,7 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { PoolPluginDynamic(void const* serialData, size_t serialLength); ~PoolPluginDynamic() {} - nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { - 
return new PoolPluginDynamic(ceil_mode_, pool_type_, adaptive_, ksize_, - strides_, paddings_, is_global_); - } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; const char* getPluginType() const TRT_NOEXCEPT override { return "pool_plugin_dynamic"; @@ -229,6 +220,7 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { bool ceil_mode_; std::string pool_type_; bool adaptive_; + bool exclusive_; std::vector ksize_; std::vector strides_; std::vector paddings_; diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index b899ddbcd5a4e30e065eb1969c41fde6046a8ea7..6cd7d87332323f4bafd49b8b16254f9610405658 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -58,6 +58,11 @@ else () set(AllocatorFacadeDeps) endif() +if (WITH_CUSTOM_DEVICE) + cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager) + set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) +endif() + if (WITH_GPU) nv_test(best_fit_allocator_test SRCS best_fit_allocator_test.cc diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 35131446d8647e0581d2d997451017293b7ca8dc..fc34a64d62636cca3d274fb2294a5d9139ae5d77 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -62,6 +62,11 @@ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/memory/allocation/custom_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -186,6 +191,17 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + ++dev_id) { + InitNaiveBestFitCustomDeviceAllocator( + platform::CustomPlace(dev_type, dev_id)); + } + } #endif break; } @@ -222,6 +238,17 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + ++dev_id) { + InitAutoGrowthCustomDeviceAllocator( + platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); + } + } #endif break; } @@ -700,6 +727,21 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) { + allocators_[p] = std::make_shared(p); + } + + void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p, + bool allow_free_idle_chunk) { + auto custom_allocator = + std::make_shared(p); + allocators_[p] = std::make_shared( + custom_allocator, platform::DeviceManager::GetMinChunkSize(p), + allow_free_idle_chunk); + } +#endif + void InitSystemAllocators() { if (!system_allocators_.empty()) return; 
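// [Editor's sketch, not part of the original patch] Rough picture of how the custom-device
// allocators registered above get used; the device type string and the memory::Alloc entry point
// below are assumptions for illustration only:
//
//   platform::CustomPlace place("my_device", 0);       // hypothetical plug-in device type
//   auto holder = paddle::memory::Alloc(place, 1024);  // dispatched by AllocatorFacade to the
//                                                      // naive-best-fit or auto-growth
//                                                      // CustomAllocator chosen above
//
// Callers keep going through the ordinary memory facade; only the Place type changes.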
system_allocators_[platform::CPUPlace()] = std::make_shared(); @@ -770,6 +812,16 @@ class AllocatorFacadePrivate { places.emplace_back(platform::MLUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id++) { + places.emplace_back(platform::CustomPlace(dev_type, dev_id)); + } + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); @@ -1005,7 +1057,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); } #endif - platform::CUDAPlace p(place.GetDeviceId()); if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb035ea5e3ad409777114cca44cd945ed4bd9541 --- /dev/null +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/custom_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool CustomAllocator::IsAllocThreadSafe() const { return true; } +void CustomAllocator::FreeImpl(pten::Allocation* allocation) { + PADDLE_ENFORCE_EQ( + allocation->place(), place_, + platform::errors::PermissionDenied("CustomDevice memory is " + "freed in incorrect device. " + "This may be a bug")); + + delete allocation; +} + +pten::Allocation* CustomAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::DeviceManager::SetDevice(place_); }); + + void* ptr = + platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); + if (LIKELY(ptr)) { + return new Allocation(ptr, size, place_); + } + + size_t avail, total; + platform::DeviceManager::MemoryStats(place_, &total, &avail); + + auto dev_type = platform::PlaceHelper::GetDeviceType(place_); + auto dev_id = platform::PlaceHelper::GetDeviceId(place_); + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on %s:%d. " + "Cannot allocate %s memory on %s:%d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using %s:%d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another %s.\n" + "2. 
If no, please decrease the batch size of your model.\n\n", + dev_type, dev_id, string::HumanReadableSize(size), dev_type, dev_id, + string::HumanReadableSize(avail), dev_type, dev_id, dev_type)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/custom_allocator.h b/paddle/fluid/memory/allocation/custom_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..708c105a850087f49becde702590920a0f9afc9d --- /dev/null +++ b/paddle/fluid/memory/allocation/custom_allocator.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CustomAllocator : public Allocator { + public: + explicit CustomAllocator(const platform::CustomPlace& place) + : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; + + private: + platform::Place place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 91358b688040aa9789e3268eb0e29dc6790c0e13..b63f872141c802f512332750d36a3116df2c40c9 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,6 +20,7 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -30,7 +31,6 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif -#include "paddle/fluid/platform/device/device_wrapper.h" PADDLE_DEFINE_EXPORTED_bool( init_allocated_mem, false, @@ -733,6 +733,136 @@ uint64_t Release(const platform::MLUPlace &place) { #endif } +// For CustomDevice +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class BuddyAllocatorList { + private: + explicit BuddyAllocatorList(const std::string &device_type) + : device_type_(device_type) { + auto devices = platform::DeviceManager::GetDeviceList(device_type); + for (auto dev_id : devices) { + init_flags_[dev_id].reset(new std::once_flag()); + } + } + + static BuddyAllocatorList *CreateNewInstance(const std::string &device_type) { + return new BuddyAllocatorList(device_type); + } + + public: + static BuddyAllocatorList *Instance(const std::string &device_type) { + // DeviceType -> AllocatorList + static std::unordered_map 
pool; + if (pool.find(device_type) == pool.end()) { + pool[device_type] = CreateNewInstance(device_type); + } + return pool[device_type]; + } + + BuddyAllocator *Get(int dev_id) { + PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), + platform::errors::OutOfRange( + "Cannot find %s %d, please check visible devices.", + device_type_, dev_id)); + + std::call_once(*init_flags_[dev_id], [this, dev_id] { + platform::DeviceManager::SetDevice(device_type_, dev_id); + platform::CustomPlace place(device_type_, dev_id); + + allocators_[dev_id].reset(new BuddyAllocator( + std::unique_ptr( + new detail::CustomAllocator(device_type_, dev_id)), + platform::DeviceManager::GetMinChunkSize(place), + platform::DeviceManager::GetMaxChunkSize(place), + platform::DeviceManager::GetExtraPaddingSize(place), device_type_)); + }); + + return allocators_[dev_id].get(); + } + + private: + std::string device_type_; + std::unordered_map> init_flags_; + std::unordered_map> allocators_; +}; + +BuddyAllocator *GetBuddyAllocator(const platform::Place &place) { + VLOG(10) << "GetBuddyAllocator place = " << place; + if (platform::is_custom_place(place)) { + return BuddyAllocatorList::Instance( + platform::PlaceHelper::GetDeviceType(place)) + ->Get(platform::PlaceHelper::GetDeviceId(place)); + } else { + PADDLE_THROW( + platform::errors::InvalidArgument("place must be CustomPlace")); + } +} +#endif + +template <> +void *Alloc(const platform::CustomPlace &place, + size_t size) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + auto *buddy_allocator = GetBuddyAllocator(place); + auto *ptr = buddy_allocator->Alloc(size); + + if (ptr == nullptr) { + platform::DeviceGuard guard(place); + size_t avail, total; + platform::DeviceManager::MemoryStats(place, &total, &avail); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in %s:%d, available %s, total %s, used " + "%s.
", + string::HumanReadableSize(size), place.GetDeviceType(), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(total - avail))); + } else { + if (FLAGS_init_allocated_mem) { + platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, + size); + } + } + VLOG(10) << " pointer=" << ptr; + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::CustomPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetBuddyAllocator(place)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::CustomPlace &place) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return GetBuddyAllocator(place)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +size_t Used(const platform::CustomPlace &place) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return GetBuddyAllocator(place)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor { inline explicit AllocVisitor(size_t size) : size_(size) {} diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index b02fb6642be3fd4ade7dc1b4ed7642be28cc7757..d7bbfba932cb4a5aab01bc3e2d1276dbe6450b29 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -25,9 +25,7 @@ limitations under the License. 
*/ DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif -#ifdef PADDLE_WITH_MLU -#include "paddle/fluid/platform/device/mlu/mlu_info.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace memory { @@ -35,12 +33,37 @@ namespace detail { BuddyAllocator::BuddyAllocator( std::unique_ptr system_allocator, size_t min_chunk_size, - size_t max_chunk_size, size_t extra_padding_size) + size_t max_chunk_size, size_t extra_padding_size, + const std::string dev_type) : min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), extra_padding_size_(extra_padding_size), cache_(system_allocator->UseGpu()), - system_allocator_(std::move(system_allocator)) {} + system_allocator_(std::move(system_allocator)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (!dev_type.empty()) { + init_allocate_size_func_ = [dev_type]() { + return platform::DeviceManager::GetInitAllocSize( + platform::PlaceHelper::CreatePlace(dev_type)); + }; + re_allocate_size_func_ = [dev_type]() { + return platform::DeviceManager::GetReallocSize( + platform::PlaceHelper::CreatePlace(dev_type)); + }; + } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + init_allocate_size_func_ = &platform::GpuInitAllocSize; + re_allocate_size_func_ = &platform::GpuReallocSize; +#elif defined(PADDLE_WITH_ASCEND_CL) + init_allocate_size_func_ = &platform::NPUInitAllocSize; + re_allocate_size_func_ = &platform::NPUReallocSize; +#elif defined(PADDLE_WITH_MLU) + init_allocate_size_func_ = &platform::MLUInitAllocSize; + re_allocate_size_func_ = &platform::MLUReallocSize; +#endif + } +#endif +} BuddyAllocator::~BuddyAllocator() { VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " @@ -224,6 +247,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( size_t allocate_bytes = max_chunk_size_; size_t index = 0; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + allocate_bytes = DeviceAllocateSize(init_allocate_size_func_, + re_allocate_size_func_, request_bytes); +#else #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); @@ -233,6 +260,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( #elif defined(PADDLE_WITH_MLU) allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); +#endif #endif // Allocate a new block diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 0d736f680503a6ce59e88142a9eec2ad4ebfdd26..5296192b8fd9b632be4638d47153e113fd2ae576 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -39,7 +39,8 @@ class BuddyAllocator { public: BuddyAllocator(std::unique_ptr system_allocator, size_t min_chunk_size, size_t max_chunk_size, - size_t extra_padding_size = 0); + size_t extra_padding_size = 0, + const std::string dev_type = ""); ~BuddyAllocator(); @@ -123,6 +124,9 @@ class BuddyAllocator { /*! 
Allocate CPU/GPU memory from system */ std::unique_ptr system_allocator_; std::mutex mutex_; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::function init_allocate_size_func_, re_allocate_size_func_; +#endif }; } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 773122de6c3198b09c33241a0d6a09e9357f65a3..a61f98c4e1a22adcc3684a9e5af190a82e3b5110 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -38,6 +38,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "paddle/fluid/platform/device/device_wrapper.h" + DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); @@ -430,6 +432,51 @@ void MLUAllocator::Free(void* p, size_t size, size_t index) { bool MLUAllocator::UseGpu() const { return true; } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +void* CustomAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto place = platform::CustomPlace(dev_type_, dev_id_); + auto device = platform::DeviceManager::GetDeviceWithPlace(place); + p = device->MemoryAllocate(size); + if (LIKELY(p)) { + VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size; + *index = 0; + plug_alloc_size += size; + } else { + size_t avail, total; + + platform::DeviceManager::MemoryStats(place, &total, &avail); + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on %s %d. " + "total memory is %s, used memory is %s, " + "available memory is only %s.\n\n", + dev_type_, dev_id_, string::HumanReadableSize(total), + string::HumanReadableSize(total - avail), + string::HumanReadableSize(avail))); + } + return p; +} + +void CustomAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "CustomAllocator::Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(plug_alloc_size, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, plug_alloc_size)); + plug_alloc_size -= size; + auto place = platform::CustomPlace(dev_type_, dev_id_); + auto device = platform::DeviceManager::GetDeviceWithPlace(place); + device->MemoryDeallocate(p, size); +} + +bool CustomAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index 975e2891b2472ad4aeb5c4a7d6f676c516350545..f6ff6282a614a3152dee5bd0e45ebe3b733fe14f 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include // for size_t +#include namespace paddle { namespace memory { @@ -107,6 +108,23 @@ class MLUAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomAllocator : public SystemAllocator { + public: + explicit CustomAllocator(const std::string& device_type, size_t dev_id) + : dev_type_(device_type), dev_id_(dev_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t plug_alloc_size = 0; + std::string dev_type_; + size_t dev_id_; +}; +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index d2ab438fd2946701c70ea0bebf35ac33fbfb521e..d857b1c1671a789fa122a1d4115461fc0b5ba840 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -19,9 +19,88 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/place.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + namespace paddle { namespace memory { +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + platform::DeviceManager::SetDevice(src_place); + platform::stream::Stream stream_wrapper(src_place, stream); + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( + dst, src, num, &stream_wrapper); +} + +template <> +void Copy( + platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + platform::DeviceManager::SetDevice(dst_place); + platform::stream::Stream stream_wrapper(dst_place, stream); + platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( + dst, src, num, &stream_wrapper); +} + +template <> +void Copy( + platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + if (src_type == dst_type) { + platform::DeviceManager::SetDevice(src_place); + platform::stream::Stream stream_wrapper(src_place, stream); + + auto src_id = 
platform::PlaceHelper::GetDeviceId(src_place); + auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place); + if (src_id == dst_id) { + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( + dst, src, num, &stream_wrapper); + } else { + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( + dst_place, dst, src, num, &stream_wrapper); + } + } else { + PADDLE_THROW(platform::errors::Unavailable( + "Copy between %s and %s is not supported.", src_type, dst_type)); + } +} +#endif // PADDLE_WITH_CUSTOM_DEVICE + template <> void Copy(platform::CPUPlace, void* dst, platform::CPUPlace, @@ -158,7 +237,7 @@ void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -168,7 +247,8 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation // after async is not ok, since the async operation may not done. @@ -186,7 +266,7 @@ void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -196,7 +276,8 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -211,7 +292,7 @@ void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -221,7 +302,7 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, - stream); + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -239,7 +320,7 @@ void Copy(platform::NPUPlace dst_place, // TODO(zhiqiu): support peer access? 
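// [Editor's note, not part of the original patch] The aclrtStream / gpuStream_t parameters in
// these Copy specializations are switched to void* (with a reinterpret_cast back to the concrete
// stream type at the call sites), presumably so that every backend, including the new custom
// devices, can share one device-agnostic Copy signature; the behaviour of each copy itself is
// unchanged.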
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, - stream); + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -284,7 +365,7 @@ void Copy( template <> void Copy( platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, aclrtStream stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -294,7 +375,8 @@ void Copy( if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -307,7 +389,7 @@ void Copy( template <> void Copy( platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, aclrtStream stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -317,7 +399,8 @@ void Copy( if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation // after async is not ok, since the async operation may not done. @@ -379,6 +462,23 @@ void Copy(pten::Place dst_place, void* dst, platform::NPUPinnedPlace place_dst; platform::NPUPlace place_src(src_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -492,7 +592,7 @@ inline void SyncCUDAStream() { template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -501,9 +601,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, 
src, num, cudaMemcpyDeviceToHost, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); @@ -522,7 +624,7 @@ void Copy( template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -531,9 +633,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); @@ -552,7 +656,7 @@ void Copy( template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -562,9 +666,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); @@ -578,7 +684,7 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, stream); + num, reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, @@ -620,8 +726,7 @@ void Copy( template <> void Copy( platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -629,9 +734,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); @@ -647,7 +754,7 @@ template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -656,9 
+763,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); @@ -674,7 +783,7 @@ void Copy( template <> void Copy(pten::Place dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { if (src_place.GetType() == pten::AllocationType::CPU && dst_place.GetType() == pten::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -719,6 +828,23 @@ void Copy(pten::Place dst_place, void* dst, platform::CUDAPinnedPlace place_dst; platform::CUDAPlace place_src(src_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -726,7 +852,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::CPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -735,7 +861,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::CPUPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -743,7 +869,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::GPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, src, num, stream); } @@ -753,7 +879,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::GPUPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, stream); @@ -764,7 +890,7 @@ template <> void Copy(pten::GPUPinnedPlace dst_place, void* dst, pten::Place src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -773,7 +899,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::GPUPinnedPlace src_place, const void* src, size_t num, - gpuStream_t 
stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -800,7 +926,7 @@ void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetMLUDeviceId(src_place.device); @@ -808,7 +934,8 @@ void Copy(platform::CPUPlace dst_place, VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); - platform::MLUMemcpyD2HAsync(dst, src, num, stream); + platform::MLUMemcpyD2HAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -825,7 +952,7 @@ void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetMLUDeviceId(dst_place.device); @@ -833,7 +960,8 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); - platform::MLUMemcpyH2DAsync(dst, src, num, stream); + platform::MLUMemcpyH2DAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -850,7 +978,7 @@ void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; if (dst_place == src_place) { @@ -860,7 +988,8 @@ void Copy(platform::MLUPlace dst_place, << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event( "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); - platform::MLUMemcpyD2DAsync(dst, src, num, stream); + platform::MLUMemcpyD2DAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -877,7 +1006,7 @@ void Copy(platform::MLUPlace dst_place, << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, stream); + num, reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; @@ -892,7 +1021,7 @@ void Copy(platform::MLUPlace dst_place, template <> void Copy(pten::Place dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { if (src_place.GetType() == pten::AllocationType::CPU && dst_place.GetType() == pten::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -912,6 +1041,23 @@ void Copy(pten::Place dst_place, void* dst, platform::MLUPlace place_src(src_place.GetDeviceId()); platform::MLUPlace place_dst(dst_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return 
Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -919,7 +1065,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::MLUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, src, num, stream); } @@ -929,7 +1075,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, stream); @@ -939,7 +1085,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::CPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -948,7 +1094,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::CPUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -1013,7 +1159,7 @@ void Copy(pten::Place dst_place, void* dst, } #endif #ifdef PADDLE_WITH_IPU - else if (src_place.GetType() == pten::AllocationType::CPU && + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT dst_place.GetType() == pten::AllocationType::IPU) { platform::IPUPlace place_dst(dst_place.GetDeviceId()); platform::CPUPlace place_src; @@ -1048,5 +1194,48 @@ void Copy(pten::CPUPlace dst_place, void* dst, Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num); } +#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MLU) + +template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, void* stream) { + if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, void* stream) { + 
Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + void* stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 31d1a50e778f8c86400163a774af6dc04dce10ed..dd861a15b5c7b03e932eff8747668268b14618ef 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -36,66 +36,25 @@ namespace memory { template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -/** - * \brief Copy memory from one place to another place. - * - * \param[in] DstPlace Destination allocation place (CPU or GPU). - * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or GPU). - * \param[in] src Source memory address. - * \param[in] num memory size in bytes to copy. - * \param[in] stream CUDA stream. - * - * \note For GPU memory copy, CUDA stream need to be specified - * for asynchronously memory copy. - * - */ -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - gpuStream_t stream); -#endif - -#ifdef PADDLE_WITH_ASCEND_CL -/** - * \brief Copy memory from one place to another place. - * - * \param[in] DstPlace Destination allocation place (CPU or NPU). - * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or NPU). - * \param[in] src Source memory address. - * \param[in] num memory size in bytes to copy. - * \param[in] stream NPU stream. - * - * \note For NPU memory copy, NPU stream need to be specified - * for asynchronously memory copy. - * - */ -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - aclrtStream stream); -#endif - -#ifdef PADDLE_WITH_MLU /** * \brief Copy memory from one place to another place. * - * \param[in] DstPlace Destination allocation place (CPU or MLU). + * \param[in] DstPlace Destination allocation place (CPU or GPU or XPU or + * CustomDevice). * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or MLU). + * \param[in] SrcPlace Source allocation place (CPU or GPU or XPU or + * CustomDevice). * \param[in] src Source memory address. * \param[in] num memory size in bytes to copy. - * \param[in] stream MLU stream. + * \param[in] stream stream for asynchronously memory copy. * - * \note For MLU memory copy, MLU stream need to be specified - * for asynchronously memory copy. + * \note For GPU/XPU/CustomDevice memory copy, stream need to be specified + * for asynchronously memory copy, and type is restored in the + * implementation. 
* */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - mluStream stream); -#endif - + void* stream); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b87cdf6f6df19314342a24c98032d3856f0d3779..a279c76430f1b046a4c3ca05485824d5e3b62de2 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -1,5 +1,9 @@ include(operators) +# solve "math constants not defined" problems caused by the order of inclusion +# of and the definition of macro _USE_MATH_DEFINES +add_definitions(-D_USE_MATH_DEFINES) + # clean cache and pybind_file content first when rebuild unset(GLOB_OP_LIB CACHE) unset(OP_LIBRARY CACHE) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 5a09933e0ee24889f5192a8c84e449e09bdc147e..149a87fe32da16e850d5d64fb519c9bde7afef62 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/abs_op.h" - #include #include #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -108,7 +107,7 @@ class AbsDoubleGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { - op->SetType("abs_grad_grad"); + op->SetType("abs_double_grad"); // input1: x op->SetInput("X", this->Input("X")); // input2: ddx @@ -159,37 +158,4 @@ REGISTER_OPERATOR(abs_grad, ops::AbsGradOp, ops::AbsDoubleGradMaker, ops::AbsDoubleGradMaker); -REGISTER_OPERATOR(abs_grad_grad, ops::AbsDoubleGradOp); - -REGISTER_OP_CPU_KERNEL( - abs, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel>, - ops::AbsKernel>); - -REGISTER_OP_CPU_KERNEL( - abs_grad, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel>, - ops::AbsGradKernel>); - -REGISTER_OP_CPU_KERNEL( - abs_grad_grad, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel>, - ops::AbsDoubleGradKernel>); +REGISTER_OPERATOR(abs_double_grad, ops::AbsDoubleGradOp); diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu deleted file mode 100644 index 882c8547a04154778389bd7cd77531b63d19915b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/abs_op.cu +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
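Note on the unified memory::Copy declaration added to paddle/fluid/memory/memcpy.h above: every device backend now goes through a single overload that takes an opaque void* stream. The short sketch below is not part of this patch; it only illustrates how a caller might use that overload, with hypothetical places and stream handle, and it assumes a CUDA-enabled build where the implementation casts the handle back to gpuStream_t.
// Hedged usage sketch (not part of this patch): the unified Copy overload
// with an opaque void* stream instead of gpuStream_t/aclrtStream/mluStream.
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

void CopyHostToDevice0(void* dst, const void* src, size_t num, void* stream) {
  paddle::platform::CUDAPlace gpu_place(0);  // hypothetical destination: GPU 0
  paddle::platform::CPUPlace cpu_place;      // source: host memory
  // The specialization restores the concrete stream type internally, so the
  // caller only passes an opaque handle (e.g. a cudaStream_t cast to void*).
  paddle::memory::Copy(gpu_place, dst, cpu_place, src, num, stream);
}
The same call shape applies to the CustomPlace specializations added in memcpy.cc; only the place arguments differ.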
- -#include "paddle/fluid/operators/abs_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -struct CudaAbsFunctor; - -template -struct CudaAbsFunctor>> { - __device__ __forceinline__ math::Real operator()(const T x) const { - return abs(x); - } -}; - -template -struct CudaAbsFunctor>> { - __device__ __forceinline__ T operator()(const T x) const { - return std::abs(x); - } -}; - -template -class AbsKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - out->mutable_data>(context.GetPlace()); - - auto& dev_ctx = - context.template device_context(); - std::vector ins = {x}; - std::vector outs = {out}; - auto functor = CudaAbsFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel>( - dev_ctx, ins, &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - abs, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel>, - ops::AbsKernel>); - -REGISTER_OP_CUDA_KERNEL( - abs_grad, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel>, - ops::AbsGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - abs_grad_grad, ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel>, - ops::AbsDoubleGradKernel>); diff --git a/paddle/fluid/operators/abs_op.h b/paddle/fluid/operators/abs_op.h deleted file mode 100644 index c79e83314f3bd39dcf6736e66c0b12956a2b0e81..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/abs_op.h +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
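As background for the add_definitions(-D_USE_MATH_DEFINES) line added to paddle/fluid/operators/CMakeLists.txt above, the standalone snippet below (not part of the patch) illustrates the inclusion-order pitfall the CMake comment refers to: on MSVC, M_PI and the other math constants are only exposed when _USE_MATH_DEFINES is defined before the first inclusion of the math header, so defining it globally through the build system sidesteps the ordering problem.
// Standalone illustration (not part of this patch) of the MSVC math-constants
// pitfall worked around by the global -D_USE_MATH_DEFINES definition.
#define _USE_MATH_DEFINES  // must precede the first inclusion of <cmath>
#include <cmath>
#include <cstdio>

int main() {
  // Without _USE_MATH_DEFINES (or the global compile definition), MSVC
  // reports "M_PI: undeclared identifier" here.
  std::printf("pi = %.6f\n", M_PI);
  return 0;
}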
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -class AbsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(math::Real))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - math::AbsFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class AbsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - math::AbsGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -template -class AbsDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* ddx = ctx.Input("DDX"); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* ddout = ctx.Output("DDOut"); - - auto numel = ddx->numel(); - auto* ddx_data = ddx->data(); - auto* x_data = x->data(); - auto* ddout_data = ddout->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - math::AbsGradGradFunctor functor(ddx_data, x_data, ddout_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc index cc2b0925c21e527c6835822161f2dcfd959b1b2d..30ec22cf6d868381a4a78585dc3620a2ea78d466 100644 --- a/paddle/fluid/operators/abs_op_npu.cc +++ b/paddle/fluid/operators/abs_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. 
*/ -#include "paddle/fluid/operators/abs_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index 093a04f03df95681d7837d5c44717c678589e679..1e0dc803d76123573332b040bc29ece263c11d80 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -17,7 +17,7 @@ #define _USE_MATH_DEFINES #endif #include -#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -26,81 +26,6 @@ namespace paddle { namespace operators { -namespace math { -template -struct AngleFunctor; - -// angel function for complex -template -struct AngleFunctor>> { - AngleFunctor(const T* input, Real* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = arg(input_[idx]); - } - - const T* input_; - Real* output_; - int64_t numel_; -}; - -// angel function for real -template -struct AngleFunctor>> { - AngleFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = input_[idx] < static_cast(0) ? M_PI : 0; - } - - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct AngleGradFunctor; - -// angle grad for complex -template -struct AngleGradFunctor>> { - AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) - : dout_(dout), x_(x), dx_(dx), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == T(0)) { - dx_[idx] = T(0); - } else { - const math::Real r_square = - x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; - dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, - dout_[idx] * x_[idx].real / r_square); - } - } - - const math::Real* dout_; - const T* x_; - T* dx_; - int64_t numel_; -}; - -// angle grad for real -template -struct AngleGradFunctor>> { - AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) - : dout_(dout), x_(x), dx_(dx), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } - - const math::Real* dout_; - const T* x_; - T* dx_; - int64_t numel_; -}; -} // namespace math - using Tensor = framework::Tensor; template class AngleKernel : public framework::OpKernel { @@ -111,12 +36,12 @@ class AngleKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(math::Real))); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(pten::funcs::Real))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::AngleFunctor functor(x_data, out_data, numel); + pten::funcs::AngleFunctor functor(x_data, out_data, numel); for_range(functor); } }; @@ -132,14 +57,14 @@ class AngleGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* x_data = x->data(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - 
math::AngleGradFunctor functor(dout_data, x_data, dx_data, numel); + pten::funcs::AngleGradFunctor functor(dout_data, x_data, dx_data, numel); for_range(functor); } }; diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h index f134bd0cd3c7a565019c92bf08ee4c565ba67ac5..565b1cee9f7852caa1d8de3d4bac67d6669b327a 100644 --- a/paddle/fluid/operators/center_loss_op.h +++ b/paddle/fluid/operators/center_loss_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/transform.h" + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h index 5004aad7c59bc4ad194bf961aaad6326ad03fd38..2c92969225f3bcbb8008c24c21e4a6f80dd03fd4 100644 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -64,7 +64,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, // calculate u's conjugate for complex framework::Tensor u_conj(u_bst.type()); platform::ForRange u_for_range(dev_ctx, u_bst.numel()); - math::ConjFunctor u_functor( + pten::funcs::ConjFunctor u_functor( u_bst.data(), u_bst.numel(), u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); u_for_range(u_functor); @@ -73,7 +73,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, // calculate b's conjugate for complex framework::Tensor b_conj(b_bst.type()); platform::ForRange b_for_range(dev_ctx, b_bst.numel()); - math::ConjFunctor b_functor( + pten::funcs::ConjFunctor b_functor( b_bst.data(), b_bst.numel(), b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); b_for_range(b_functor); @@ -113,7 +113,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, // calculate out's conjugate for complex platform::ForRange out_for_range(dev_ctx, out->numel()); - math::ConjFunctor out_functor( + pten::funcs::ConjFunctor out_functor( out->data(), out->numel(), out->mutable_data(out->dims(), dev_ctx.GetPlace())); out_for_range(out_functor); @@ -173,7 +173,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { // calculate out's conjugate for complex framework::Tensor out_conj(out->type()); platform::ForRange out_for_range(dev_ctx, out->numel()); - math::ConjFunctor out_functor( + pten::funcs::ConjFunctor out_functor( out->data(), out->numel(), out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); out_for_range(out_functor); @@ -195,7 +195,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { framework::Tensor commonterm_conj(commonterm.type()); platform::ForRange commonterm_for_range( dev_ctx, commonterm.numel()); - math::ConjFunctor commonterm_functor( + pten::funcs::ConjFunctor commonterm_functor( commonterm.data(), commonterm.numel(), commonterm_conj.mutable_data(commonterm.dims(), dev_ctx.GetPlace())); diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index ed3a7598bdab6b288d280c13af79f16ff0a84e46..b80916616a18b7521d6ae32711ca247fdfd3e403 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,8 +1,10 @@ include(operators) -register_operators(EXCLUDES cinn_launch_op) +cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) cc_library(cinn_launch_context SRCS 
cinn_launch_context.cc DEPS ddim lod_tensor scope cinn) -op_library(cinn_launch_op SRCS cinn_launch_op.cc cinn_launch_op.cu.cc DEPS string_helper cinn cinn_compiler cinn_launch_context) + +SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) +register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope cinn_launch_context) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..edf854a9c95b088225ac0eb225f056f0c531c393 --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle::operators { + +class CinnInstructionRunOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun"); + OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs, + "CinnInstructionRun"); + const CinnCompiledObject& compiled_object = + CinnCompiler::GetInstance()->GetCompiledObject( + ctx->Attrs().Get(kCachedIndex)); + + details::CinnLaunchContext* launch_context = + compiled_object.launch_context.get(); + std::vector output_args = ctx->Outputs(kOutputs); + std::vector output_dims(output_args.size()); + std::transform(output_args.begin(), output_args.end(), output_dims.begin(), + [launch_context](const std::string& var_name) { + cinn_buffer_t* buffer = + launch_context->GetCinnBufferOfVar(var_name); + return framework::DDim(buffer->dims, buffer->dimensions); + }); + ctx->SetOutputsDim(kOutputs, output_dims); + } +}; + +class CinnInstructionRunOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput(kX, + "(vector)" + "which are the input arguments of this cinn instruction") + .AsDuplicable(); + AddOutput(kOutputs, + "(vector)" + "which are the output arguments of this cinn instruction") + .AsDuplicable(); + AddAttr( + kCachedIndex, + "(int64_t)" + "the stored index of the cached compilation result in CinnCompiler," + "which is used to fetch the CinnCompiledObject where this cinn " + "instruction is included"); + AddAttr( + kInstructionIndex, + "(int64_t)" + "the index of this instruction to the cinn runtime program"); + AddComment(R"DOC( +CinnInstructionRun Operator. 
+ +This operator is used to launch a +CINN(https://github.com/PaddlePaddle/CINN/blob/develop/README.md) instruction execution. + +Both the input and output of this operator are a set of variables +which are the input and output arguments of the bound cinn instruction respectively. +In addition, an attribute named 'cached_index' must be set to locate the +CinnCompiledObject where the instruction is included, and 'instruction_index' is +used to fetch the instruction object from the compiled runtime program. + +It accomplishes the execution of the instruction according to the following steps: + 0. Set the shapes of the output variables at the InferShape function with + the compilation result. + 1. Fetch the cinn instruction bound to this operator by 'cached_index' + and 'instruction_index' from CinnCompiler. + 2. Prepare the input and output variables of the instruction in Paddle and share + their buffers with CINN by setting the 'memory' field of the corresponding cinn_buffer_t. + 3. Launch CINN runtime to execute the instruction. + +)DOC"); + } +}; + +} // namespace paddle::operators + +namespace ops = paddle::operators; +using CPUDeviceContext = paddle::platform::CPUDeviceContext; +REGISTER_OPERATOR( + cinn_instruction_run, ops::CinnInstructionRunOp, + ops::CinnInstructionRunOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + cinn_instruction_run, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a1b00a182067b909a08fa50744bacfde39c5c830 --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL( + cinn_instruction_run, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel, + ops::CinnInstructionRunOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8847faa944bef228e418c347c486fa2b42090eed --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" + +namespace paddle::operators { + +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; +using CinnCompiler = framework::paddle2cinn::CinnCompiler; + +template +class CinnInstructionRunOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // step 1: fetch the cinn instruction bound to this operator + auto cached_index = ctx.template Attr(kCachedIndex); + auto ins_index = ctx.template Attr(kInstructionIndex); + const CinnCompiledObject& compiled_object = + CinnCompiler::GetInstance()->GetCompiledObject(cached_index); + const std::vector>& instructions = + compiled_object.runtime_program->GetRunInstructions(); + PADDLE_ENFORCE_LT(ins_index, instructions.size(), + platform::errors::InvalidArgument( + "Index(%ld) > instructions.size(%ld).", ins_index, + instructions.size())); + auto&& instruction = instructions.at(ins_index); + + // step 2: prepare the input and output arguments of the instruction + details::CinnLaunchContext* launch_context = + compiled_object.launch_context.get(); + auto share_argument_buffer_fn = [launch_context, + &ctx](const std::string& var_name) { + cinn_buffer_t* buffer = launch_context->GetCinnBufferOfVar(var_name); + framework::Variable* var = ctx.scope().GetVar(var_name); + auto* tensor = var->template GetMutable(); + buffer->memory = + reinterpret_cast(tensor->mutable_data(ctx.GetPlace())); + }; + std::vector in_args = ctx.InputNames(kX); + std::for_each(in_args.begin(), in_args.end(), share_argument_buffer_fn); + std::vector out_args = ctx.OutputNames(kOutputs); + std::for_each(out_args.begin(), out_args.end(), share_argument_buffer_fn); + + // step 3: launch CINN runtime to execute the instruction + // TODO(CtfGo): simplify format of arguments package as a vector in CINN + // and update this usage call + instruction->Run(&launch_context->FinalizeArguments(), false, + details::GetStream(ctx)); + } +}; + +} // namespace paddle::operators diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index fa93bf00f2ac0dcd0c3dcb778357dda7d9ce3518..282a8f69e4ec5c194bf5226132ced33ad02ac676 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -24,12 +24,31 @@ CinnLaunchContext::CinnLaunchContext( const std::unordered_map& paddle2cinn_varmap, const std::shared_ptr& cinn_scope) : paddle2cinn_varmap_(paddle2cinn_varmap), cinn_scope_(cinn_scope) { + // generate all names of cinn used variables auto var_names = cinn_scope_->var_names(); 
cinn_variable_names_.reserve(var_names.size()); std::transform( var_names.begin(), var_names.end(), std::inserter(cinn_variable_names_, cinn_variable_names_.end()), [](const auto& name_view) { return std::string(name_view.data()); }); + // build the variable name map of cinn2paddle + for (const auto& x : paddle2cinn_varmap_) { + auto res = cinn2paddle_varmap_.emplace(x.second, x.first); + PADDLE_ENFORCE_EQ( + res.second, true, + platform::errors::InvalidArgument( + "Cinn variable(%s) maps to more than one paddle variable(%s,%s)", + x.second, res.first->second, x.first)); + } + // supplement the relations of the remain variables not appearing in above + // map, + // they are internal variables and here we use the name from cinn compiled. + for (const auto& var_name : cinn_variable_names_) { + if (!cinn2paddle_varmap_.count(var_name)) { + cinn2paddle_varmap_.emplace(var_name, var_name); + paddle2cinn_varmap_.emplace(var_name, var_name); + } + } } void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, @@ -189,6 +208,20 @@ CinnLaunchContext::FinalizeArguments() const { return name2argument_; } +cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( + const std::string& paddle_var_name) { + auto res = paddle2cinn_varmap_.find(paddle_var_name); + PADDLE_ENFORCE_NE( + res, paddle2cinn_varmap_.end(), + platform::errors::InvalidArgument( + "Variable(%s) not found in compilation result", paddle_var_name)); + auto it = name2argument_.find(res->second); + PADDLE_ENFORCE_NE(it, name2argument_.end(), + platform::errors::InvalidArgument( + "Argument(%s) not be initialized", res->second)); + return static_cast(it->second); +} + } // namespace details } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 7b71d77d8b8860264872b88d86a5cfe7ae82be96..71ddeb35420b52c12787cb3873fbe5b7d4f7b8c1 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -64,6 +64,8 @@ class CinnLaunchContext { // Finalize all execution arguments and return them const std::map& FinalizeArguments() const; + cinn_buffer_t* GetCinnBufferOfVar(const std::string& paddle_var_name); + private: // Get CinnTensor with CINN variable name CinnTensor GetCinnTensor(const std::string& var_name); @@ -84,19 +86,22 @@ class CinnLaunchContext { std::unique_ptr cached_temp_scope_ = nullptr; // a variable name map from paddle to cinn - const std::unordered_map& paddle2cinn_varmap_; + std::unordered_map paddle2cinn_varmap_; + // a variable name map from cinn to paddle + std::unordered_map cinn2paddle_varmap_; // the variable scope of cinn const std::shared_ptr cinn_scope_; - // all variables used by compiled executable program + // all names of cinn variables used by compiled executable program std::unordered_set cinn_variable_names_; // because a cinn_pod_value_t does not own the cinn_buffer_t object, // an extra stroage is necessary to keep the object and it can - // not be released until runtime program finish execution. + // not be released until the runtime program finish execution. std::vector> hold_buffers_; - // name to execution argument + // this map saves all execution arguments with their cinn names as key, + // and it is passed to the Execute interface of a cinn runtime program. 
std::map name2argument_; }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index cd17c947228d6201b551410172246498f75f3b12..d918b7216c4d2f1e8cd0891d3a0dc0a5d2ed4339 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -13,10 +13,11 @@ // limitations under the License. #include "paddle/fluid/operators/cinn/cinn_launch_op.h" - #include #include - +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/runtime/cinn_runtime.h" +#include "cinn/runtime/flags.h" #include "paddle/fluid/string/string_helper.h" DECLARE_bool(cudnn_deterministic); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index ea36a19202ef0696918792210f20dd2c2818e700..9dfd53834e937e201a76d44d4a841f4625c24b19 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -13,36 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_op.h" -#include -#include -#include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/device/gpu/gpu_types.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -#ifdef PADDLE_WITH_CUDA -#include -#endif - -namespace paddle { -namespace operators { -namespace details { - -#ifdef PADDLE_WITH_CUDA -template <> -void* GetStream( - const framework::ExecutionContext& ctx) { - const auto& dev_ctx = - ctx.template device_context(); - return dev_ctx.stream(); -} -#endif - -} // namespace details -} // namespace operators -} // namespace paddle /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL(cinn_launch, diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 23dfa9d84c01203f3edbef6216cccbc340ffda52..bd9b30f559bdb5e6af3081125d9278ad21046cd7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -18,27 +18,18 @@ #include #include #include -#include "cinn/hlir/framework/graph_compiler.h" -#include "cinn/hlir/framework/scope.h" -#include "cinn/runtime/cinn_runtime.h" -#include "cinn/runtime/flags.h" +#include "cinn/common/target.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" namespace paddle { namespace operators { -constexpr char kX[] = "X"; -constexpr char kNoNeedBufferX[] = "NoNeedBufferX"; -constexpr char kOutputs[] = "Out"; -constexpr char kCompilationKey[] = "compilation_key"; - using LoDTensor = framework::LoDTensor; -using CinnTensor = ::cinn::hlir::framework::Tensor; -using CinnScope = ::cinn::hlir::framework::Scope; using CinnCompiler = framework::paddle2cinn::CinnCompiler; using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; @@ -57,17 +48,6 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS. 
void SetCinnRuntimeFlags(); -template -void* GetStream(const framework::ExecutionContext& ctx) { - return nullptr; -} - -#ifdef PADDLE_WITH_CUDA -template <> -void* GetStream( - const framework::ExecutionContext& ctx); -#endif - } // namespace details template diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 849cdb715049ba235f737117e0769ec0a9105942..b4cd91ea8a4bce6f8a2bbeb01d15f03cb5053de7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/platform/init.h" USE_OP(cinn_launch); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.cc b/paddle/fluid/operators/cinn/cinn_op_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..3fb9c822c77c4ddb631a31610af0cc950c7533a8 --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_op_helper.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle::operators::details { + +#ifdef PADDLE_WITH_CUDA +template <> +void* GetStream( + const framework::ExecutionContext& ctx) { + const auto& dev_ctx = + ctx.template device_context(); + return dev_ctx.stream(); +} +#endif + +} // namespace paddle::operators::details diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.h b/paddle/fluid/operators/cinn/cinn_op_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..e542134b94689692e88382b6506a9d87d4708fa2 --- /dev/null +++ b/paddle/fluid/operators/cinn/cinn_op_helper.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/fluid/framework/operator.h" + +// We define some common names or utility functions +// for operators related to cinn in this file +namespace paddle::operators { + +// input params, output params and attributes +constexpr char kX[] = "X"; +constexpr char kNoNeedBufferX[] = "NoNeedBufferX"; +constexpr char kOutputs[] = "Out"; +constexpr char kCompilationKey[] = "compilation_key"; +constexpr char kCachedIndex[] = "cached_index"; +constexpr char kInstructionIndex[] = "instruction_index"; + +// utility functions +namespace details { + +template +void* GetStream(const framework::ExecutionContext& ctx) { + return nullptr; +} + +#ifdef PADDLE_WITH_CUDA +template <> +void* GetStream( + const framework::ExecutionContext& ctx); +#endif + +} // namespace details +} // namespace paddle::operators diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc index 9d27d99b3ab35835330e629f21502d05d635103a..199e2b6bc7fc6cb3ec82c550058c8df14980fc01 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -32,7 +32,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); diff --git a/paddle/fluid/operators/complex_op.h b/paddle/fluid/operators/complex_op.h index 3dd5ea9f7e83dbfaa353378cfee10231c445c222..fb324277fb004b93718793346957b9adbb10143b 100644 --- a/paddle/fluid/operators/complex_op.h +++ b/paddle/fluid/operators/complex_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/complex_view_op.h b/paddle/fluid/operators/complex_view_op.h index 9a8d89db4020828cc9fdba90dd99ab7e5395864b..98ba732e2400421073fdbefc76ff4207fe5a9a8d 100644 --- a/paddle/fluid/operators/complex_view_op.h +++ b/paddle/fluid/operators/complex_view_op.h @@ -17,9 +17,9 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 68720e70b09ad6098da4fd59c50bbb89a56c9dc7..a044506cef4bb480d30bc87f3b556560a4d61064 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -265,6 +265,18 @@ class ConditionalBlockGradInferShape : public framework::InferShapeBase { } }; +class ConditionalBlockGradInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + // NOTE(Aurelius84): VarType of Output is LoDTensor by default. 
In case of + // Input is {Tensor, LoDTensorArray}, we need synchronous the Input's + // VarType into Input@GRAD to avoid generating {Tensor, Tensor} as + // Input@GRAD. + ctx->SyncTypeAndDataType(ConditionalOp::kInputs, + framework::GradVarName(ConditionalOp::kInputs)); + } +}; + template class ConditionalBlockGradMaker : public framework::SingleGradOpMaker { public: @@ -300,4 +312,5 @@ REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp, ops::ConditionalBlockOpProtoMaker, ops::ConditionalBlockGradMaker); REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp, - ops::ConditionalBlockGradInferShape); + ops::ConditionalBlockGradInferShape, + ops::ConditionalBlockGradInferVarType); diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu index 2b69db7d24a128de77e8508970705ac3b9a1fb66..3a63bd99ad57d8c91235ddf50219e6b015321972 100644 --- a/paddle/fluid/operators/cumprod_op.cu +++ b/paddle/fluid/operators/cumprod_op.cu @@ -14,9 +14,9 @@ #include #include "paddle/fluid/operators/cumprod_op.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/inclusive_scan.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -243,12 +243,12 @@ class CumprodGradOpCUDAKernel : public framework::OpKernel { platform::ForRange for_range_x(dev_ctx, numel); - math::ConjFunctor functor_x(x_data, numel, x_data_conj); + pten::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); for_range_x(functor_x); platform::ForRange for_range_y(dev_ctx, numel); - math::ConjFunctor functor_y(y_data, numel, y_data_conj); + pten::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); for_range_y(functor_y); x_data_deal = x_data_conj; y_data_deal = y_data_conj; diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h index d8c3c1febdcf3ef336bc7c68ad9636bd9989c22e..15c3d514331b671817c97b0036a00b1279263dbb 100644 --- a/paddle/fluid/operators/cumprod_op.h +++ b/paddle/fluid/operators/cumprod_op.h @@ -18,8 +18,8 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -124,12 +124,12 @@ class CumprodGradOpCPUKernel : public framework::OpKernel { platform::ForRange for_range_x(dev_ctx, numel); - math::ConjFunctor functor_x(x_data, numel, x_data_conj); + pten::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); for_range_x(functor_x); platform::ForRange for_range_out(dev_ctx, numel); - math::ConjFunctor functor_out(out_data, numel, out_data_conj); + pten::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); for_range_out(functor_out); x_data_deal = x_data_conj; diff --git a/paddle/fluid/operators/detection/prior_box_op_xpu.cc b/paddle/fluid/operators/detection/prior_box_op_xpu.cc index bab394689546e495a0f7892870c071f0fb7b3f06..c39f702a48644e529c429854125c386af2f3224d 100644 --- a/paddle/fluid/operators/detection/prior_box_op_xpu.cc +++ b/paddle/fluid/operators/detection/prior_box_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/detection/prior_box_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -81,21 +82,17 @@ class PriorBoxOpXPUKernel : public framework::OpKernel { dev_ctx.x_context(), boxes_data, aspect_ratios_param, min_sizes_param, max_sizes_param, feature_height, feature_width, img_height, img_width, offset, step_height, step_width, clip, min_max_aspect_ratios_order); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU gen_prior_box kernel return wrong value[%d %s]", - ret, XPUAPIErrorMsg[ret])); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gen_prior_box"); int box_num = feature_height * feature_width * num_priors; int vlen = variances.size(); + std::vector var_cpu(vlen * box_num); for (int i = 0; i < box_num; ++i) { - ret = xpu_memcpy(vars_data + i * vlen, variances.data(), vlen * sizeof(K), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External( - "XPU xpu_memcpy return wrong " - "value[%d %s] in prior_box.", - ret, XPUAPIErrorMsg[ret])); + std::copy(variances.begin(), variances.end(), var_cpu.begin() + i * vlen); } + ret = xpu_memcpy(vars_data, var_cpu.data(), var_cpu.size() * sizeof(K), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_XPU_SUCCESS(ret); } }; diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 90443e0928ba2535498122ea00df479b83acb56f..1da680fbd953a7f86cef4e9db13d1e63336f9a29 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,11 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -395,7 +395,7 @@ class SlogDeterminantGradKernel : public framework::OpKernel { size_t(numel * sizeof(T))); platform::ForRange for_range(dev_ctx, numel); - math::ConjFunctor functor(inverse_A.data(), numel, conj_data); + pten::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); for_range(functor); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index c5d43ef01264b097ffb9c17bc716bd3dcedf8ce0..52fc26342a1b441dff032c13e0b423504a77265e 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -16,8 +16,8 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index b4b6e2ce2fc5664e8016e7815e037933d461e80d..f822802d305e9b03ac9e00604121033c7a1faa6d 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -17,12 +17,12 @@ #include #include #include -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/lapack_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" #include 
"paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/math_function.h" #define EPSILON 1e-6 @@ -87,18 +87,19 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, int values_stride = values->dims()[values->dims().size() - 1]; Tensor rwork; - math::Real* rwork_data = nullptr; + pten::funcs::Real* rwork_data = nullptr; rwork.Resize(framework::make_ddim({lda * 2})); - rwork_data = rwork.mutable_data>(context.GetPlace()); + rwork_data = rwork.mutable_data>(context.GetPlace()); // call lapackEig once to compute the size of work; T computed_work_size; - math::lapackEig>( + math::lapackEig>( jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); - lwork = std::max(1, static_cast(math::Real(computed_work_size))); + lwork = std::max( + 1, static_cast(pten::funcs::Real(computed_work_size))); Tensor work; work.Resize(framework::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); @@ -108,7 +109,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, T* current_values = &values_data[i * values_stride]; T* current_rvectors = &rvector_data[i * matrix_stride]; - math::lapackEig>( + math::lapackEig>( jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); PADDLE_ENFORCE_EQ( @@ -207,26 +208,27 @@ class EigKernel : public framework::OpKernel { origin_dim.push_back(last_item * 2); framework::DDim big_dim = framework::make_ddim(origin_dim); - real_values.mutable_data>(big_dim, context.GetPlace()); - real_vectors.mutable_data>(x->dims(), context.GetPlace()); + real_values.mutable_data>(big_dim, + context.GetPlace()); + real_vectors.mutable_data>(x->dims(), + context.GetPlace()); - ApplyEigKernel>(*x, &real_values, - &real_vectors, context); - auto dito = - math::DeviceIndependenceTensorOperations, - Tout>(context); + ApplyEigKernel>( + *x, &real_values, &real_vectors, context); + auto dito = math::DeviceIndependenceTensorOperations< + DeviceContext, pten::funcs::Real, Tout>(context); // 1. extract real part & imag part from real_values Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); // 2. 
construct complex values - auto* real_part_data = real_part.data>(); - auto* imag_part_data = imag_part.data>(); + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); int out_values_numel = out_values->numel(); platform::ForRange for_range( context.template device_context(), out_values_numel); - math::RealImagToComplexFunctor functor( + pten::funcs::RealImagToComplexFunctor functor( real_part_data, imag_part_data, out_values->mutable_data(context.GetPlace()), out_values_numel); for_range(functor); @@ -235,7 +237,7 @@ class EigKernel : public framework::OpKernel { Tensor real_vector_trans = dito.Transpose(real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); - ConstructComplexVectors, Tout>( + ConstructComplexVectors, Tout>( &out_vectors_trans, *out_values, real_vector_trans, context, batch_count, order); TransposeTwoAxis(out_vectors_trans, out_vectors, @@ -271,14 +273,14 @@ void ComputeBackwardForComplexInput( // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un = diag_unsqueezed.data>(); auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), static_cast(numel * sizeof(Tout))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, - numel); + pten::funcs::RealToComplexFunctor functor(data_diag_un, + data_diag_un_com, numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h index ad9b0f598311b1f44ae3f7ec34bda8489e422fc7..77afaf681da939cdc089325d5e210e5960142f55 100644 --- a/paddle/fluid/operators/eigh_op.h +++ b/paddle/fluid/operators/eigh_op.h @@ -40,7 +40,7 @@ template class EighGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = math::Real; + using ValueType = pten::funcs::Real; auto& x_grad = *ctx.Output(framework::GradVarName("X")); x_grad.mutable_data(ctx.GetPlace()); auto& output_w = *ctx.Input("Eigenvalues"); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index d825833b0242240cb0bdbeaf1e85057fe23dc618..a069ea164c94c5c9ec50ece0d28042347ff4777f 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -20,9 +20,9 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/lapack_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -48,7 +48,7 @@ struct PaddleComplex< template using PaddleCType = typename PaddleComplex::type; template -using Real = typename math::Real; +using Real = typename pten::funcs::Real; static void SpiltBatchSquareMatrix(const Tensor& input, std::vector* output) { @@ -118,7 +118,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, platform::ForRange for_range( ctx.template device_context(), n_dim); - math::RealImagToComplexFunctor> functor( + 
pten::funcs::RealImagToComplexFunctor> functor( w_data, w_data + n_dim, output->template data>(), n_dim); for_range(functor); } @@ -143,7 +143,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_work_mem, work_mem)); int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(Real); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(pten::funcs::Real); PADDLE_ENFORCE_GE( rwork_mem, required_rwork_mem, platform::errors::InvalidArgument( @@ -153,11 +153,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_rwork_mem, rwork_mem)); int info = 0; - math::lapackEig>( + math::lapackEig>( 'N', 'N', static_cast(n_dim), a.template data(), static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, work->template data(), static_cast(work_mem / sizeof(T)), - rwork->template data>(), &info); + rwork->template data>(), &info); std::string name = "framework::platform::dynload::cgeev_"; if (framework::TransToProtoVarType(input.dtype()) == @@ -187,10 +187,10 @@ class EigvalsKernel : public framework::OpKernel { // query workspace size T qwork; int info; - math::lapackEig>('N', 'N', static_cast(n_dim), - input_matrices[0].template data(), - static_cast(n_dim), NULL, NULL, 1, NULL, 1, - &qwork, -1, static_cast*>(NULL), &info); + math::lapackEig>( + 'N', 'N', static_cast(n_dim), input_matrices[0].template data(), + static_cast(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1, + static_cast*>(NULL), &info); int64_t lwork = static_cast(qwork); Tensor work, rwork; @@ -207,8 +207,8 @@ class EigvalsKernel : public framework::OpKernel { } if (framework::IsComplexType( framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(framework::make_ddim({n_dim << 1}), - ctx.GetPlace()); + rwork.mutable_data>( + framework::make_ddim({n_dim << 1}), ctx.GetPlace()); } for (int64_t i = 0; i < n_batch; ++i) { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index f462c2ea0720b600f238109704e9606a2f7d627c..53037c1fa653648044e2dc0981ec5c63351e7c15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -117,55 +117,6 @@ REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad, ops::ElementwiseTripleGradOpInplaceInferer, ops::ElementwiseTripleGradNoBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel>, - ops::ElementwiseAddGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_grad_grad, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel>, - ops::ElementwiseAddDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_triple_grad, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel>, - ops::ElementwiseAddTripleGradKernel>); - // A specialization elementwise_add 
operator, used in gradient accumulation with // inplace addto. REGISTER_OPERATOR( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 2326aa561eaa05986c6e58bc1f2f2c93334cf893..b66cd01349d1ecb76307a6d6a24cf9b08d69cfb4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -18,51 +18,6 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -namespace paddle { -namespace operators {} // namespace operators -} // namespace paddle -REGISTER_OP_CUDA_KERNEL( - elementwise_add, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel>, - ops::ElementwiseAddGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_add_grad_grad, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel>, - ops::ElementwiseAddDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_add_triple_grad, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel>, - ops::ElementwiseAddTripleGradKernel>); - REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 73415d3fdb5c83cac1c0a8afb67548d7fa09b3c3..6f2a1fe87d70913f3699ead365e53923a7eaf83d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -43,73 +43,5 @@ class ElementwiseAddKernel : public framework::OpKernel { } }; -template -class ElementwiseAddGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *dy = ctx.Output(framework::GradVarName("Y")); - const auto &dev_ctx = ctx.template device_context(); - int axis = ctx.Attr("axis"); - pten::AddGradKernel( - static_cast::TYPE &>(dev_ctx), - *x, *y, *dout, axis, dx, dy); - } -}; - -template -class ElementwiseAddDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - - auto *y = ctx.Input("Y"); - auto *dout = ctx.Input("DOut"); - auto *ddx = ctx.Input("DDX"); - auto *ddy = ctx.Input("DDY"); - - auto *ddout = ctx.Output("DDOut"); - const auto &dev_ctx = ctx.template device_context(); - int axis = ctx.Attr("axis"); - paddle::optional ddx_optional = paddle::none; - paddle::optional ddy_optional = paddle::none; - if (ddx != nullptr) { - ddx_optional = *ddx; - } 
- if (ddy != nullptr) { - ddy_optional = *ddy; - } - pten::AddDoubleGradKernel( - static_cast::TYPE &>(dev_ctx), - *y, ddx_optional, ddy_optional, *dout, axis, ddout); - } -}; - -template -class ElementwiseAddTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - auto *ddx = ctx.Input("DDX"); - auto *ddy = ctx.Input("DDY"); - auto *d_ddout = ctx.Input("D_DDOut"); - auto *d_ddx = ctx.Output("D_DDX"); - auto *d_ddy = ctx.Output("D_DDY"); - - const auto &dev_ctx = ctx.template device_context(); - int axis = ctx.Attr("axis"); - pten::AddTripleGradKernel( - static_cast::TYPE &>(dev_ctx), - *ddx, *ddy, *d_ddout, axis, d_ddx, d_ddy); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 63ec5bd4a2805e74b8a6552a53ac65fb55a0cdf5..4732762624a5f820698d228fb105529d845af049 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -31,7 +31,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); USE_OP(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 12d82654362ac125502a1b4b73c34226647ec99e..7efa1d24dcf1fe3c62d3177321e4c5e98e8f267d 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 706475bc82fadef0eaf864d69fe3ceccb087d6f2..e1340de2096e08bcfc8d3010a87d56be869c749e 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 551d8ee6592dfcf39e15b5d5c3b40453847fb64d..94a6ba3139b1d700cfb7f3ce2cd02424da3f63bb 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -27,7 +27,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; USE_OP(matmul); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); // get paddle matmul op results as baseline template diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu old mode 100755 new mode 100644 index 0adbf0be4e28aa1d95b92a273f2a78851ca196ed..e34335e8597a75d594dc7271207f95c1599b2083 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu 
+++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -20,12 +20,11 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_dropout_test.h" -#include "paddle/fluid/operators/math/functors.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace details = paddle::operators::details; -namespace math = paddle::operators::math; /** * @brief the unittest of fused_dropout_act_bias @@ -283,12 +282,14 @@ static void BaseTest(const bool is_fp16 = false) { } TEST(FusedDropout, GPUFusedDorpoutActBias) { - BaseTest, math::ReluGradFunctor>(); + BaseTest, + pten::funcs::ReluGradFunctor>(); BaseTest, paddle::operators::GeluGradFunctor>(); } TEST(FusedDropout, GPUFusedDropoutActBiasDouble) { - BaseTest, math::ReluGradFunctor>(); + BaseTest, + pten::funcs::ReluGradFunctor>(); BaseTest, paddle::operators::GeluGradFunctor>(); } @@ -296,15 +297,16 @@ TEST(FusedDropout, GPUFusedDropoutActBiasDouble) { // test fp16, For inference, check_grad is not required. ref: test_dropout_op.py TEST(FusedDropout, GPUFusedDropoutActBiasFp16) { using fp16 = platform::float16; - BaseTest, math::ReluGradFunctor>(true); + BaseTest, + pten::funcs::ReluGradFunctor>(true); } TEST(FusedDropout, GPUFusedDropoutActBiasIsUpscaleInTrain) { const int rows = 16; const int cols = 16; for (auto is_upscale_in_train : {true, false}) { - TestFusedDropoutActBias, - math::ReluGradFunctor> + TestFusedDropoutActBias, + pten::funcs::ReluGradFunctor> test(rows, cols, 0, 1.0, is_upscale_in_train, false); test.Run(); test.CheckOut(static_cast(1e-5)); @@ -315,8 +317,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasIsUpscaleInTrain) { TEST(FusedDropout, GPUFusedDropoutActBiasIsTest) { const int rows = 16; const int cols = 16; - TestFusedDropoutActBias, - math::ReluGradFunctor> + TestFusedDropoutActBias, + pten::funcs::ReluGradFunctor> test(rows, cols, 0, 0.35, true, true); test.Run(); test.CheckOut(static_cast(1e-5)); @@ -326,8 +328,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasIsTest) { TEST(FusedDropout, GPUFusedDropoutActBiasSeed) { const int rows = 16; const int cols = 16; - TestFusedDropoutActBias, - math::ReluGradFunctor> + TestFusedDropoutActBias, + pten::funcs::ReluGradFunctor> test(rows, cols, 125, 0.0, false, false); test.Run(); test.CheckOut(static_cast(1e-5)); @@ -337,8 +339,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasSeed) { TEST(FusedDropout, GPUFusedDropoutActBiasLargeShape) { const int rows = 256; const int cols = 4096; - TestFusedDropoutActBias, - math::ReluGradFunctor> + TestFusedDropoutActBias, + pten::funcs::ReluGradFunctor> test(rows, cols); test.Run(); test.CheckOut(static_cast(1e-5)); diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index eb651e4ea7b4fc7bd156f0915edec87175d44047..b21a5fb8219ba1b0bf4a8d3e6bef6ecda6e9a653 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -21,12 +21,12 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 782c5d70ee07728b2a97730ef07f3e563b19ee4d..286f37f4496371501afe7296ef3aa4e492809ae8 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" -#include "paddle/fluid/operators/math/functors.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -167,8 +167,8 @@ class FusedDropoutHelper { dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, dropout_param_.is_test, src, bias, out, mask, ctx); } else if (act_method == "relu") { - math::ReluFunctor relu; - LaunchDropoutActBias>( + pten::funcs::ReluFunctor relu; + LaunchDropoutActBias>( relu, dropout_param_.seed, rows_, cols_, increment, dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, dropout_param_.is_test, src, bias, out, mask, ctx); @@ -187,8 +187,8 @@ class FusedDropoutHelper { gelu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); } else if (act_method == "relu") { - math::ReluGradFunctor relu_grad; - LaunchDropoutActBiasGrad>( + pten::funcs::ReluGradFunctor relu_grad; + LaunchDropoutActBiasGrad>( relu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); } else { diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index b7dd89a8a28adffc09b75a1845a79fb66c0b67c8..792069652cde8cc1d67bfe8146cb58bbb9297106 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -19,8 +19,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/compound_functors.h" -#include "paddle/fluid/operators/math/functors.h" +#include "paddle/pten/kernels/funcs/compound_functors.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -53,22 +54,22 @@ static void RunBinaryCompoundFunctor( // intermediate_out = Unary(Y) // out = Binary(X, Unary(Y)) // In this case, the shape of intermediate_out and out are different. 
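The comment above states the composition this file relies on: the unary functor is applied to Y first, and its result feeds the binary functor, so intermediate_out and out may have different shapes. Below is a small standalone sketch of that composition for the elementwise_add,scale case; the toy functor types are illustrative only and are not the actual pten::funcs implementations.

#include <cassert>

// out = Binary(x, Unary(y)); here Binary is addition and Unary is scaling.
template <typename T>
struct AddFunctorSketch {
  T operator()(T x, T y) const { return x + y; }
};

template <typename T>
struct ScaleFunctorSketch {
  explicit ScaleFunctorSketch(T scale) : scale_(scale) {}
  T operator()(T y) const { return scale_ * y; }
  T scale_;
};

template <typename T, typename BinaryFunctor, typename UnaryFunctor>
struct BinaryCompoundSketch {
  BinaryCompoundSketch(BinaryFunctor binary, UnaryFunctor unary)
      : binary_(binary), unary_(unary) {}
  // intermediate_out = Unary(y), then out = Binary(x, intermediate_out).
  T operator()(T x, T y) const { return binary_(x, unary_(y)); }
  BinaryFunctor binary_;
  UnaryFunctor unary_;
};

int main() {
  BinaryCompoundSketch<float, AddFunctorSketch<float>, ScaleFunctorSketch<float>>
      add_scale(AddFunctorSketch<float>(), ScaleFunctorSketch<float>(0.5f));
  // out = x + 0.5 * y, so add_scale(2, 4) == 4.
  assert(add_scale(2.0f, 4.0f) == 4.0f);
  return 0;
}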
- paddle::operators::math::BinaryCompoundFunctor + pten::funcs::BinaryCompoundFunctor compound_func(binary_functor, unary_functor); int axis = ctx.Attr("axis"); if (ctx.Attr("save_intermediate_out")) { - FusedElemwiseAndActComputeEx, - true /*KeepIntermediateValue*/, - false /*SameShapeOfIntermediateOutAndOut*/>( + FusedElemwiseAndActComputeEx< + DeviceContext, T, + pten::funcs::BinaryCompoundFunctor, + true /*KeepIntermediateValue*/, + false /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); } else { - FusedElemwiseAndActComputeEx, - false /*KeepIntermediateValue*/, - false /*SameShapeOfIntermediateOutAndOut*/>( + FusedElemwiseAndActComputeEx< + DeviceContext, T, + pten::funcs::BinaryCompoundFunctor, + false /*KeepIntermediateValue*/, + false /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); } } @@ -85,22 +86,22 @@ static void RunUnaryCompoundFunctors( // In this case, the shape of intermediate_out and out are the same. int axis = ctx.Attr("axis"); - paddle::operators::math::UnaryCompoundFunctor + pten::funcs::UnaryCompoundFunctor compound_func(unary_functor, binary_functor); if (ctx.Attr("save_intermediate_out")) { - FusedElemwiseAndActComputeEx, - true /*KeepIntermediateValue*/, - true /*SameShapeOfIntermediateOutAndOut*/>( + FusedElemwiseAndActComputeEx< + DeviceContext, T, + pten::funcs::UnaryCompoundFunctor, + true /*KeepIntermediateValue*/, + true /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); } else { - FusedElemwiseAndActComputeEx, - false /*KeepIntermediateValue*/, - true /*SameShapeOfIntermediateOutAndOut*/>( + FusedElemwiseAndActComputeEx< + DeviceContext, T, + pten::funcs::UnaryCompoundFunctor, + false /*KeepIntermediateValue*/, + true /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); } } @@ -120,13 +121,12 @@ static void RunBinaryCompoundGradFunctors( int axis = ctx.Attr("axis"); using BinaryCompoundDxFunctor = - paddle::operators::math::BinaryCompoundGradDxFunctor; - using BinaryCompoundDyFunctor = - paddle::operators::math::BinaryCompoundGradDyFunctor< - T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor, InPlace>; + pten::funcs::BinaryCompoundGradDxFunctor; + using BinaryCompoundDyFunctor = pten::funcs::BinaryCompoundGradDyFunctor< + T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor, InPlace>; using BinaryCompoundDIntermedaiteOutFunctor = - paddle::operators::math::BinaryCompoundGradDIntermedaiteOutFunctor< + pten::funcs::BinaryCompoundGradDIntermedaiteOutFunctor< T, BinaryGradFunctor, UnaryFunctor>; if (in_intermediate_out) { @@ -170,14 +170,12 @@ static void RunUnaryCompoundGradFunctors( // Z = Unary(Binary(X, Y)) int axis = ctx.Attr("axis"); - using UnaryCompoundDxFunctor = - paddle::operators::math::UnaryCompoundGradDxFunctor< - T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; - using UnaryCompoundDyFunctor = - paddle::operators::math::UnaryCompoundGradDyFunctor< - T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; + using UnaryCompoundDxFunctor = pten::funcs::UnaryCompoundGradDxFunctor< + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; + using UnaryCompoundDyFunctor = pten::funcs::UnaryCompoundGradDyFunctor< + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; using UnaryCompoundDIntermediateFunctor = - paddle::operators::math::UnaryCompoundGradDIntermediateFunctor< + 
pten::funcs::UnaryCompoundGradDIntermediateFunctor< T, UnaryGradFunctor, BinaryFunctor, InPlace>; if (in_intermediate_out) { @@ -219,69 +217,60 @@ static void RunFunctors(const framework::ExecutionContext &ctx, if (funcs_str == "elementwise_add,scale") { // Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundFunctor, - paddle::operators::math::ScaleFunctor>( - ctx, paddle::operators::math::AddFunctor(), - paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::ScaleFunctor>( + ctx, pten::funcs::AddFunctor(), pten::funcs::ScaleFunctor(scale), + in_x, in_y, outputs); } else if (funcs_str == "scale,elementwise_add") { // Z = Unary(Binary(X, Y)) T scale = static_cast(ctx.Attr("scale")); - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::ScaleFunctor(scale), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + RunUnaryCompoundFunctors, + pten::funcs::AddFunctor>( + ctx, pten::funcs::ScaleFunctor(scale), pten::funcs::AddFunctor(), + in_x, in_y, outputs); } else if (funcs_str == "elementwise_add,relu") { // Z = Binary(X, Unary(Y)) - RunBinaryCompoundFunctor, - paddle::operators::math::ReluFunctor>( - ctx, paddle::operators::math::AddFunctor(), - paddle::operators::math::ReluFunctor(), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::ReluFunctor>( + ctx, pten::funcs::AddFunctor(), pten::funcs::ReluFunctor(), in_x, + in_y, outputs); } else if (funcs_str == "relu,elementwise_add") { // Z = Unary(Binary(X, Y)) - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::ReluFunctor(), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + RunUnaryCompoundFunctors, + pten::funcs::AddFunctor>( + ctx, pten::funcs::ReluFunctor(), pten::funcs::AddFunctor(), in_x, + in_y, outputs); } else if (funcs_str == "elementwise_mul,scale") { // Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundFunctor, - paddle::operators::math::ScaleFunctor>( - ctx, paddle::operators::math::MulFunctor(), - paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::ScaleFunctor>( + ctx, pten::funcs::MultiplyFunctor(), + pten::funcs::ScaleFunctor(scale), in_x, in_y, outputs); } else if (funcs_str == "tanh,elementwise_add") { // Z = Unary(Binary(X, Y)) - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::TanhFunctor(), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + RunUnaryCompoundFunctors, + pten::funcs::AddFunctor>( + ctx, pten::funcs::TanhFunctor(), pten::funcs::AddFunctor(), in_x, + in_y, outputs); } else if (funcs_str == "elementwise_mul,tanh") { // Z = Binary(X, Unary(Y)) - RunBinaryCompoundFunctor, - paddle::operators::math::TanhFunctor>( - ctx, paddle::operators::math::MulFunctor(), - paddle::operators::math::TanhFunctor(), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::TanhFunctor>( + ctx, pten::funcs::MultiplyFunctor(), pten::funcs::TanhFunctor(), + in_x, in_y, outputs); } else if (funcs_str == "elementwise_mul,sigmoid") { // Z = Binary(X, Unary(Y)) - RunBinaryCompoundFunctor, - paddle::operators::math::SigmoidFunctor>( - ctx, paddle::operators::math::MulFunctor(), - paddle::operators::math::SigmoidFunctor(), in_x, in_y, outputs); + RunBinaryCompoundFunctor, + pten::funcs::SigmoidFunctor>( + ctx, pten::funcs::MultiplyFunctor(), + 
pten::funcs::SigmoidFunctor(), in_x, in_y, outputs); } else if (funcs_str == "gelu,elementwise_add") { // Z = Unary(Binary(X, Y)) - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::GeluFunctor(), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + RunUnaryCompoundFunctors, + pten::funcs::AddFunctor>( + ctx, pten::funcs::GeluFunctor(), pten::funcs::AddFunctor(), in_x, + in_y, outputs); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s has not been implemented.", funcs_str)); @@ -301,95 +290,83 @@ static void RunGradFunctors( if (funcs_str == "elementwise_add_grad,scale_grad") { // The backward of Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::AddGradFunctor, - paddle::operators::math::ScaleFunctor, - paddle::operators::math::ScaleGradFunctor, InPlace>( - ctx, paddle::operators::math::AddGradFunctor(), - paddle::operators::math::ScaleFunctor(scale), - paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, + RunBinaryCompoundGradFunctors, + pten::funcs::ScaleFunctor, + pten::funcs::ScaleGradFunctor, InPlace>( + ctx, pten::funcs::AddGradFunctor(), + pten::funcs::ScaleFunctor(scale), + pten::funcs::ScaleGradFunctor(scale), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "scale_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) T scale = static_cast(ctx.Attr("scale")); RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::ScaleGradFunctor, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::ScaleGradFunctor(scale), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); + DeviceContext, T, pten::funcs::ScaleGradFunctor, + pten::funcs::AddFunctor, pten::funcs::AddGradFunctor, InPlace>( + ctx, pten::funcs::ScaleGradFunctor(scale), + pten::funcs::AddFunctor(), pten::funcs::AddGradFunctor(), in_x, + in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, + d_intermediate_out); } else if (funcs_str == "elementwise_add_grad,relu_grad") { // The backward of Z = Binary(X, Unary(Y)) RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::AddGradFunctor, - paddle::operators::math::ReluFunctor, - paddle::operators::math::ReluGradFunctor, InPlace>( - ctx, paddle::operators::math::AddGradFunctor(), - paddle::operators::math::ReluFunctor(), - paddle::operators::math::ReluGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::AddGradFunctor, + pten::funcs::ReluFunctor, pten::funcs::ReluGradFunctor, InPlace>( + ctx, pten::funcs::AddGradFunctor(), pten::funcs::ReluFunctor(), + pten::funcs::ReluGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "relu_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::ReluGradFunctor, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::ReluGradFunctor(), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::ReluGradFunctor, + 
pten::funcs::AddFunctor, pten::funcs::AddGradFunctor, InPlace>( + ctx, pten::funcs::ReluGradFunctor(), pten::funcs::AddFunctor(), + pten::funcs::AddGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_mul_grad,scale_grad") { // The backward of Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::MulGradFunctor, - paddle::operators::math::ScaleFunctor, - paddle::operators::math::ScaleGradFunctor, InPlace>( - ctx, paddle::operators::math::MulGradFunctor(), - paddle::operators::math::ScaleFunctor(scale), - paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, + RunBinaryCompoundGradFunctors, + pten::funcs::ScaleFunctor, + pten::funcs::ScaleGradFunctor, InPlace>( + ctx, pten::funcs::MulGradFunctor(), + pten::funcs::ScaleFunctor(scale), + pten::funcs::ScaleGradFunctor(scale), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "tanh_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::TanhGradFunctor, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::TanhGradFunctor(), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::TanhGradFunctor, + pten::funcs::AddFunctor, pten::funcs::AddGradFunctor, InPlace>( + ctx, pten::funcs::TanhGradFunctor(), pten::funcs::AddFunctor(), + pten::funcs::AddGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_mul_grad,tanh_grad") { // The backward of Z = Binary(X, Unary(Y)) RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::MulGradFunctor, - paddle::operators::math::TanhFunctor, - paddle::operators::math::TanhGradFunctor, InPlace>( - ctx, paddle::operators::math::MulGradFunctor(), - paddle::operators::math::TanhFunctor(), - paddle::operators::math::TanhGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::MulGradFunctor, + pten::funcs::TanhFunctor, pten::funcs::TanhGradFunctor, InPlace>( + ctx, pten::funcs::MulGradFunctor(), pten::funcs::TanhFunctor(), + pten::funcs::TanhGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_mul_grad,sigmoid_grad") { // The backward of Z = Binary(X, Unary(Y)) - RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::MulGradFunctor, - paddle::operators::math::SigmoidFunctor, - paddle::operators::math::SigmoidGradFunctor, InPlace>( - ctx, paddle::operators::math::MulGradFunctor(), - paddle::operators::math::SigmoidFunctor(), - paddle::operators::math::SigmoidGradFunctor(), in_x, in_y, in_out, + RunBinaryCompoundGradFunctors, + pten::funcs::SigmoidFunctor, + pten::funcs::SigmoidGradFunctor, InPlace>( + ctx, pten::funcs::MulGradFunctor(), pten::funcs::SigmoidFunctor(), + pten::funcs::SigmoidGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "gelu_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::GeluGradFunctor, - 
paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::GeluGradFunctor(), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + DeviceContext, T, pten::funcs::GeluGradFunctor, + pten::funcs::AddFunctor, pten::funcs::AddGradFunctor, InPlace>( + ctx, pten::funcs::GeluGradFunctor(), pten::funcs::AddFunctor(), + pten::funcs::AddGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 911c2cda57504793059da160831411180bf6524e..ef61b78d6828170e0a6c0ce98fea4d7f467323f9 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -122,12 +122,12 @@ __global__ void FusedLayernormResidualDropoutBias( __shared__ U shared_mean[32]; __shared__ U shared_var[32]; - math::ReluFunctor relu; + pten::funcs::ReluFunctor relu; U mean_val = 0; U var_val = 0; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { FusedResidualDropoutBiasOneThread>( + pten::funcs::ReluFunctor>( row_id, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, mask, is_test, &mean_val, &var_val, relu); } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 2f5ec839fc2c73984cdec00f246c24d777321044..264e2e5f22d671318d0e73bed419717c1a024ced 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -115,12 +115,12 @@ __global__ void FusedResidualDropoutBias( curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); const T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); - math::ReluFunctor relu; + pten::funcs::ReluFunctor relu; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { FusedResidualDropoutBiasOneThread>( + pten::funcs::ReluFunctor>( r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, mask, is_test, nullptr, nullptr, relu); } diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc index 32cc38ef1953364266181598f44ccd54e9dc631c..2df6b539ff68aa4934dc2562792a55a58b670417 100644 --- a/paddle/fluid/operators/histogram_op.cc +++ b/paddle/fluid/operators/histogram_op.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/histogram_op.h" - #include #include #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -85,8 +85,3 @@ REGISTER_OPERATOR( histogram, ops::HistogramOp, ops::HistogramOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - histogram, ops::HistogramKernel, - ops::HistogramKernel, - ops::HistogramKernel, - ops::HistogramKernel); diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu deleted file mode 100644 index 48a637e6c37b1cf37e5653397ded01775eb54551..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/histogram_op.cu +++ /dev/null @@ -1,156 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/histogram_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/pten/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using IndexType = int64_t; -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__device__ static IndexType GetBin(T input_value, T min_value, T max_value, - int64_t nbins) { - IndexType bin = static_cast((input_value - min_value) * nbins / - (max_value - min_value)); - IndexType output_index = bin < nbins - 1 ? 
bin : nbins - 1; - return output_index; -} - -template -__global__ void KernelHistogram(const T* input, const int total_elements, - const int64_t nbins, const T min_value, - const T max_value, int64_t* output) { - extern __shared__ int64_t buf_hist[]; - for (int i = threadIdx.x; i < nbins; i += blockDim.x) { - buf_hist[i] = 0; - } - __syncthreads(); - - CUDA_KERNEL_LOOP(input_index, total_elements) { - // const IndexType input_index = threadIdx.x + blockIdx.x * blockDim.x; - const auto input_value = input[input_index]; - if (input_value >= min_value && input_value <= max_value) { - const IndexType output_index = - GetBin(input_value, min_value, max_value, nbins); - paddle::platform::CudaAtomicAdd(&buf_hist[output_index], 1); - } - } - __syncthreads(); - - for (int i = threadIdx.x; i < nbins; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[i], buf_hist[i]); - } -} - -template -class HistogramCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - - const Tensor* input = context.Input("X"); - Tensor* output = context.Output("Out"); - auto& nbins = context.Attr("bins"); - auto& minval = context.Attr("min"); - auto& maxval = context.Attr("max"); - - const T* input_data = input->data(); - const int input_numel = input->numel(); - - int64_t* out_data = output->mutable_data(context.GetPlace()); - pten::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - if (input_data == nullptr) return; - - T output_min = static_cast(minval); - T output_max = static_cast(maxval); - - if (output_min == output_max) { - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - auto input_max_scala = framework::EigenScalar::From(input_max_t); - - auto* place = - context.template device_context().eigen_device(); - input_min_scala.device(*place) = input_x.minimum(); - input_max_scala.device(*place) = input_x.maximum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - - output_min = input_min_cpu.data()[0]; - output_max = input_max_cpu.data()[0]; - } - if (output_min == output_max) { - output_min = output_min - 1; - output_max = output_max + 1; - } - - PADDLE_ENFORCE_EQ( - (std::isinf(static_cast(output_min)) || - std::isnan(static_cast(output_max)) || - std::isinf(static_cast(output_min)) || - std::isnan(static_cast(output_max))), - false, platform::errors::OutOfRange("range of min, max is not finite")); - PADDLE_ENFORCE_GE( - output_max, output_min, - platform::errors::InvalidArgument( - "max must be larger or equal to min. If min and max are both zero, " - "the minimum and maximum values of the data are used. 
" - "But received max is %d, min is %d", - maxval, minval)); - - auto stream = - context.template device_context().stream(); - KernelHistogram< - T, IndexType><<>>( - input_data, input_numel, nbins, output_min, output_max, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - histogram, - ops::HistogramCUDAKernel, - ops::HistogramCUDAKernel, - ops::HistogramCUDAKernel, - ops::HistogramCUDAKernel); diff --git a/paddle/fluid/operators/histogram_op.h b/paddle/fluid/operators/histogram_op.h deleted file mode 100644 index 9e280336e492af97d0107062f2d2a5ef22191133..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/histogram_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/pten/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class HistogramKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - Tensor* output = context.Output("Out"); - auto& nbins = context.Attr("bins"); - auto& minval = context.Attr("min"); - auto& maxval = context.Attr("max"); - - const T* input_data = input->data(); - auto input_numel = input->numel(); - - int64_t* out_data = output->mutable_data(context.GetPlace()); - pten::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - if (input_data == nullptr) return; - - T output_min = static_cast(minval); - T output_max = static_cast(maxval); - if (output_min == output_max) { - output_min = *std::min_element(input_data, input_data + input_numel); - output_max = *std::max_element(input_data, input_data + input_numel); - } - if (output_min == output_max) { - output_min = output_min - 1; - output_max = output_max + 1; - } - - PADDLE_ENFORCE_EQ( - (std::isinf(static_cast(output_min)) || - std::isnan(static_cast(output_max)) || - std::isinf(static_cast(output_min)) || - std::isnan(static_cast(output_max))), - false, platform::errors::OutOfRange("range of min, max is not finite")); - PADDLE_ENFORCE_GE( - output_max, output_min, - platform::errors::InvalidArgument( - "max must be larger or equal to min. If min and max are both zero, " - "the minimum and maximum values of the data are used. 
" - "But received max is %d, min is %d", - maxval, minval)); - - for (int64_t i = 0; i < input_numel; i++) { - if (input_data[i] >= output_min && input_data[i] <= output_max) { - const int64_t bin = (int64_t)((input_data[i] - output_min) * nbins / - (output_max - output_min)); - out_data[std::min(bin, nbins - 1)] += 1; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/imag_op.h b/paddle/fluid/operators/imag_op.h index 562a8dffa90623ed44c51ff1048c25550f5a7ce7..02682cfc954be57dd7900326dd98dae507fadeaa 100644 --- a/paddle/fluid/operators/imag_op.h +++ b/paddle/fluid/operators/imag_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -31,12 +31,13 @@ class ImagKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), static_cast(numel * sizeof(math::Real))); + auto* out_data = out->mutable_data>( + ctx.GetPlace(), + static_cast(numel * sizeof(pten::funcs::Real))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::ImagFunctor functor(x_data, out_data, numel); + pten::funcs::ImagFunctor functor(x_data, out_data, numel); for_range(functor); } }; @@ -51,13 +52,13 @@ class ImagGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::ImagToComplexFunctor functor(dout_data, dx_data, numel); + pten::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } }; diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index b94182e9db73a5590ffa404508d2edda84983198..b5e2b0d776984327fa682efa2da9d961185c6433 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lerp_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -132,15 +132,3 @@ REGISTER_OPERATOR( paddle::operators::LerpInplaceInferer); REGISTER_OPERATOR(lerp_grad, paddle::operators::LerpGradOp); - -REGISTER_OP_CPU_KERNEL( - lerp, - paddle::operators::LerpKernel, - paddle::operators::LerpKernel); - -REGISTER_OP_CPU_KERNEL( - lerp_grad, - paddle::operators::LerpGradKernel, - paddle::operators::LerpGradKernel); diff --git a/paddle/fluid/operators/lerp_op.h b/paddle/fluid/operators/lerp_op.h deleted file mode 100644 index 380a8ccffd8af97b1072d0fa2083e7a60980030d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lerp_op.h +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#ifdef _WIN32 -#ifndef NOMINMAX -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#endif -#endif - -namespace paddle { -namespace operators { - -static framework::DDim ExtendDims2Rank(const framework::DDim& in_dims, - int rank) { - if (in_dims.size() == rank) { - return in_dims; - } - std::vector shapes(rank, 1); - for (int i = in_dims.size() - 1, j = rank - 1; i >= 0; --i, --j) { - shapes[j] = in_dims[i]; - } - return framework::make_ddim(shapes); -} - -template -static void GetBroadcastDims(const framework::DDim& in_dims, - const framework::DDim& out_dims, - Eigen::DSizes* bcast_dims) { - for (size_t i = 0; i < D; ++i) { - if (in_dims[i] == out_dims[i]) { - (*bcast_dims)[i] = 1; - } else { - (*bcast_dims)[i] = std::max(in_dims[i], out_dims[i]); - } - } -} - -template -static void LerpFunction(const framework::ExecutionContext& ctx) { - auto x = ctx.Input("X"); - auto y = ctx.Input("Y"); - auto w = ctx.Input("Weight"); - auto out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto out_dims = out->dims(); - auto x_dims = ExtendDims2Rank(x->dims(), D); - auto y_dims = ExtendDims2Rank(y->dims(), D); - auto w_dims = ExtendDims2Rank(w->dims(), D); - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes w_bcast_dims; - GetBroadcastDims(x_dims, out_dims, &x_bcast_dims); - GetBroadcastDims(y_dims, out_dims, &y_bcast_dims); - GetBroadcastDims(w_dims, out_dims, &w_bcast_dims); - - auto eigen_x = framework::EigenTensor::From(*x, x_dims); - auto eigen_y = framework::EigenTensor::From(*y, y_dims); - auto eigen_w = framework::EigenTensor::From(*w, w_dims); - auto eigen_out = framework::EigenTensor::From(*out); - - auto& place = *ctx.template device_context().eigen_device(); - eigen_out.device(place) = - eigen_x.broadcast(x_bcast_dims) + - eigen_w.broadcast(w_bcast_dims) * - (eigen_y.broadcast(y_bcast_dims) - eigen_x.broadcast(x_bcast_dims)); -} - -template -static void LerpGradFunction(const framework::ExecutionContext& ctx) { - auto w = ctx.Input("Weight"); - auto dout = ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.Output(framework::GradVarName("X")); - auto dy = ctx.Output(framework::GradVarName("Y")); - - auto dout_dims = dout->dims(); - auto dx_dims = ExtendDims2Rank(dx->dims(), D); - auto dy_dims = ExtendDims2Rank(dy->dims(), D); - auto w_dims = ExtendDims2Rank(w->dims(), D); - Eigen::DSizes dx_bcast_dims; - Eigen::DSizes dy_bcast_dims; - Eigen::DSizes w_bcast_dims; - GetBroadcastDims(dx_dims, dout_dims, &dx_bcast_dims); - GetBroadcastDims(dy_dims, dout_dims, &dy_bcast_dims); - GetBroadcastDims(w_dims, dout_dims, &w_bcast_dims); - - auto eigen_w = framework::EigenTensor::From(*w, w_dims); - auto eigen_dout = framework::EigenTensor::From(*dout); - - Eigen::DSizes dx_reshape_dims; - Eigen::DSizes dy_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < dout_dims.size(); ++i) { - dx_reshape_dims[2 * i] = dx_bcast_dims[i]; - dx_reshape_dims[2 * i + 1] = dx_dims[i]; - dy_reshape_dims[2 * i] = 
dy_bcast_dims[i]; - dy_reshape_dims[2 * i + 1] = dy_dims[i]; - reduce_dims[i] = 2 * i; - } - - auto& place = *ctx.template device_context().eigen_device(); - - if (dx) { - dx->mutable_data(ctx.GetPlace()); - auto eigen_dx = framework::EigenTensor::From(*dx, dx_dims); - auto eigen_expr = (1 - eigen_w.broadcast(w_bcast_dims)) * eigen_dout; - eigen_dx.device(place) = eigen_expr.reshape(dx_reshape_dims) - .sum(reduce_dims) - .reshape(eigen_dx.dimensions()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - auto eigen_dy = framework::EigenTensor::From(*dy, dy_dims); - auto eigen_expr = eigen_w.broadcast(w_bcast_dims) * eigen_dout; - eigen_dy.device(place) = eigen_expr.reshape(dy_reshape_dims) - .sum(reduce_dims) - .reshape(eigen_dy.dimensions()); - } -} - -template -class LerpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Output("Out")->dims().size(); - PADDLE_ENFORCE_GE( - rank, 1, - platform::errors::InvalidArgument( - "The number of dimensions for LerpOp must be " - "greater than or equal to 1, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, 6, platform::errors::InvalidArgument( - "The number of dimensions for LerpOp must be " - "less than or equal to 6, but the value received is %d.", - rank)); - switch (rank) { - case 1: - LerpFunction(ctx); - break; - case 2: - LerpFunction(ctx); - break; - case 3: - LerpFunction(ctx); - break; - case 4: - LerpFunction(ctx); - break; - case 5: - LerpFunction(ctx); - break; - case 6: - LerpFunction(ctx); - break; - } - } -}; - -template -class LerpGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input(framework::GradVarName("Out")) - ->dims() - .size(); - PADDLE_ENFORCE_GE( - rank, 1, - platform::errors::InvalidArgument( - "The number of dimensions for LerpGradOp must be " - "greater than or equal to 1, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, 6, platform::errors::InvalidArgument( - "The number of dimensions for LerpGradOp must be " - "less than or equal to 6, but the value received is %d.", - rank)); - switch (rank) { - case 1: - LerpGradFunction(ctx); - break; - case 2: - LerpGradFunction(ctx); - break; - case 3: - LerpGradFunction(ctx); - break; - case 4: - LerpGradFunction(ctx); - break; - case 5: - LerpGradFunction(ctx); - break; - case 6: - LerpGradFunction(ctx); - break; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 6676cde1cafcabfcaee325bafe3be3703fe1a0a2..c677b4978eb3e3c03a3ae42a434ff0df3d55fe83 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -15,8 +15,9 @@ #include #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -213,15 +214,15 @@ __global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { const AccT value = static_cast(input[data_offset + d * dim_stride]); - max_value = math::MaxFunctor()(max_value, value); + max_value = pten::funcs::MaxFunctor()(max_value, 
value); } // If there are more than 1 threads along block x, reduce all max_values // and get the global max_value, which is the max value along "axis". // If there is only one thread along block x, no need to reduce, as the // 'max_value' is the global max_value. if (blockDim.x > 1) { - max_value = - BlockReduceAlongDimX(sdata, max_value); + max_value = BlockReduceAlongDimX( + sdata, max_value); } // 2. reduce sum @@ -232,7 +233,7 @@ __global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( max_value); } if (blockDim.x > 1) { - sum = BlockReduceAlongDimX(sdata, sum); + sum = BlockReduceAlongDimX(sdata, sum); } // 3. input-max-log_sum and write to output diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index 4819bd725183248d8711b94c546e11e6d30026ab..f39d65d681f2f8f0e18c2fae13154d76b8b2f76c 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -18,7 +18,6 @@ #include #include #include "paddle/fluid/operators/eig_op.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/eigen_values_vectors.h" #include "paddle/fluid/operators/math/lapack_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" @@ -26,6 +25,7 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/math_function.h" #define EPSILON 1e-6 @@ -46,7 +46,7 @@ template class LstsqCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using ValueType = math::Real; + using ValueType = pten::funcs::Real; const Tensor& x = *context.Input("X"); auto y = context.Input("Y"); @@ -169,7 +169,7 @@ class LstsqCPUKernel : public framework::OpKernel { &rwkopt, &info); } - lwork = std::max(1, static_cast(math::Real(wkopt))); + lwork = std::max(1, static_cast(pten::funcs::Real(wkopt))); Tensor work; work.Resize(framework::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 11174540cb0cd1f6e971c9fb85338b2eeb8bbfa0..0d05d766e67fb16c75d5fb0f9c798c7048a1c7f9 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -211,8 +211,9 @@ void Tensor_Conj(const DeviceContext& dev_ctx, const framework::Tensor& tensor, framework::Tensor* out) { out->Resize(tensor.dims()); platform::ForRange out_for_range(dev_ctx, tensor.numel()); - math::ConjFunctor out_functor(tensor.data(), tensor.numel(), - out->mutable_data(dev_ctx.GetPlace())); + pten::funcs::ConjFunctor out_functor( + tensor.data(), tensor.numel(), + out->mutable_data(dev_ctx.GetPlace())); out_for_range(out_functor); } diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index f9a4e963c0c478e2d4e4bb35b2ddf63e0ac7e8b8..0e6b63be90ef695801c8dc820985d3562ab429ae 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -813,6 +813,102 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, + platform::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 + 
// Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A, + CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A, + CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template <> inline void Blas::GEMM( @@ -1208,6 +1304,42 @@ inline void Blas::GEMV(bool trans_a, int M, int N, } } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::bfloat16 alpha, + const platform::bfloat16 *A, const platform::bfloat16 *B, + platform::bfloat16 beta, platform::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -1306,6 +1438,91 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 9010 } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, + strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, + strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::BatchedGEMM( @@ -1356,6 +1573,32 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 980caa9cfe68c64a1afd21a82d366b5228f8f026..9518da89edeb01a1dc35c2a6544ff2e55297a697 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -550,6 +550,84 @@ 
inline void Blas::GEMM(CBLAS_TRANSPOSE transA, rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "rocblas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, + C, rocblas_datatype_bf16_r, N, C, rocblas_datatype_bf16_r, N, + rocblas_datatype_f32_r, algo, 0, 0)); + }); +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "rocblas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, + C, rocblas_datatype_bf16_r, N, C, rocblas_datatype_bf16_r, N, + rocblas_datatype_f32_r, algo, 0, 0)); + }); +} + template <> template <> inline void Blas::GEMM( @@ -874,6 +952,39 @@ inline void Blas::GEMV(bool trans_a, int M, int N, } } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::bfloat16 alpha, + const platform::bfloat16 *A, const platform::bfloat16 *B, + platform::bfloat16 beta, platform::bfloat16 *C) const { + // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. 
+ if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -898,6 +1009,7 @@ void Blas::BatchedGEMM( ldc, strideC, batchCount); }); } + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -925,6 +1037,70 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, }); } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + const int64_t strideC = M * N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::rocblas_gemm_strided_batched_ex( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, C, + rocblas_datatype_bf16_r, ldc, strideC, batchCount, + rocblas_datatype_f32_r, algo, 0, 0)); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + const int64_t strideC = M * N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
rocblas_operation_none + : rocblas_operation_transpose; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::rocblas_gemm_strided_batched_ex( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, C, + rocblas_datatype_bf16_r, ldc, strideC, batchCount, + rocblas_datatype_f32_r, algo, 0, 0)); + }); +} + template <> template void Blas::BatchedGEMM( @@ -935,6 +1111,7 @@ void Blas::BatchedGEMM( C[k]); } } + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -973,6 +1150,32 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 9ce615c949ffcb0e7ef300dfdc4f45b87604ad0c..b946d4d072ba2e276df632e5fea6960fbbe17975 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -63,7 +63,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = math::Real; + using ValueType = pten::funcs::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = @@ -123,9 +123,9 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; auto *input_data = input_vector + i * vector_stride; - math::lapackEigh>(jobz, uplo, n, input_data, lda, value_data, - work_data, lwork, rwork_data, lrwork, - iwork_data, liwork, &info); + math::lapackEigh>( + jobz, uplo, n, input_data, lda, value_data, work_data, lwork, + rwork_data, lrwork, iwork_data, liwork, &info); CheckEighResult(i, info); } if (has_vectors) { @@ -151,7 +151,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = math::Real; + using ValueType = pten::funcs::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -233,7 +233,7 @@ struct MatrixEighFunctor { } } - using ValueType = math::Real; + using ValueType = pten::funcs::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, 
cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, int *lwork) const; diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 5fdc2889a88858769c4bdf445367dc60265d6cbf..1c750fcb832c1ca0fae51c6c5f818fe82923897e 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -26,9 +26,9 @@ namespace cub = hipcub; #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -115,7 +115,7 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { - using RealT = math::Real; + using RealT = pten::funcs::Real; constexpr auto kSharedBufferSize = framework::IsComplex::value ? 4 * kThreadNumX : 2 * kThreadNumX; __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc new file mode 100644 index 0000000000000000000000000000000000000000..506b57186965de8fff758a958cc0e87b374e64bc --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/math_function.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { +namespace math { + +using float16 = paddle::platform::float16; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +#ifdef PADDLE_WITH_XPU +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; +#endif + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +template +struct TransposeNormal { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = framework::stride(in.dims()); + auto out_stride = framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + transpose_helper(0, out->numel()); + } +}; + +// define transpose normal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_CPU_TRANS_NORMAL(platform::float16); +DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(float); +DEFINE_CPU_TRANS_NORMAL(double); +DEFINE_CPU_TRANS_NORMAL(int); +DEFINE_CPU_TRANS_NORMAL(int64_t); +DEFINE_CPU_TRANS_NORMAL(bool); +DEFINE_CPU_TRANS_NORMAL(int16_t); +DEFINE_CPU_TRANS_NORMAL(uint8_t); 
+DEFINE_CPU_TRANS_NORMAL(int8_t); +DEFINE_CPU_TRANS_NORMAL(platform::complex); +DEFINE_CPU_TRANS_NORMAL(platform::complex); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(framework::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void apply() const { + auto cpu = platform::CPUPlace(); + auto* begin = tensor_->mutable_data(cpu); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW( + platform::errors::Unimplemented("NPUPinnedPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); +} + +struct TensorSetConstantWithPlace : public boost::static_visitor { + TensorSetConstantWithPlace(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()(Place place) const { + set_constant_with_place(context_, tensor_, value_); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +void set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) { + TensorSetConstantWithPlace func(context, tensor, value); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // tensor->place().apply_visitor(func); + paddle::platform::VisitPlace(tensor->place(), func); +#else + func(platform::CPUPlace()); +#endif +} + +template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." 
+ " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src, + framework::Tensor* dst) { + auto in = framework::EigenVector::Flatten(src); + auto out = framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index eaec4b78f4fc0401c907fe0481d9b9e1da1b8ff4..40f2b625f65006061f24779c0aee2b92ec297890 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -527,7 +527,7 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, ops::MatMulV2GradOpMaker); DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, - PT_INFER_META(pten::MatmulGradInferMeta)); + PT_INFER_META(pten::GeneralBinaryGradInferMeta)); REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad, ops::MatMulV2OpDoubleGradMaker, ops::MatMulV2OpDoubleGradMaker, diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 0e1c6b82e41922cb1a7fd8404ffae1135e7872a0..6fac2d1038334528b87c056ae0d14a366432d5bc 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/dot_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index d974d7c1b78f15bb5e0f050b4e415af453e4349f..2df794fb794430910e71c0980154f682c3f4920d 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -18,11 +18,11 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/matrix_rank_op.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { @@ -93,8 +93,8 @@ class MatrixRankGPUKernel : public framework::OpKernel { info_ptr); platform::ForRange for_range( dev_ctx, eigenvalue_tensor.numel()); - math::AbsFunctor functor(eigenvalue_data, eigenvalue_data, - eigenvalue_tensor.numel()); + pten::funcs::AbsFunctor functor(eigenvalue_data, eigenvalue_data, + eigenvalue_tensor.numel()); for_range(functor); } else { Tensor U, VH; diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 8efd2b226cad6f27c12036d863dba5a60ebf586f..2c84218c48e0bcc2d22d032bf5b3e949424aec3a 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -71,9 +71,6 @@ static Tensor FoldFirstAndLastDims(const MKLDNNDeviceContext& dev_ctx, auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); - paddle::platform::RecordEvent record_reorder( - "int_reorder", paddle::platform::EventRole::kUniqueOp); - auto& astream = MKLDNNDeviceContext::tls().get_stream(); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index c44f22dd02face48fe344ea2ee91ead4e9836837..deb8c735c8b0260d80c016439b8f1ae6765b56c5 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -106,12 +106,8 @@ class QuantOpKernel : public framework::OpKernel { reorder_p = std::shared_ptr(new reorder(*reorder_pd)); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *src_memory, *dst_memory); - astream.wait(); - } + reorder_p->execute(astream, *src_memory, *dst_memory); + astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory)); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 1b1bd69aec2f4d88a65d66fd9a59d9ea9c78ee66..963f10441f9bdf2ac3369a770c43ca92ac21a7bf 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -138,12 +138,9 @@ class ReQuantOpKernel : public framework::OpKernel { } auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *src_memory, *dst_memory); - astream.wait(); - } + + reorder_p->execute(astream, *src_memory, *dst_memory); + astream.wait(); output->set_layout(framework::DataLayout::kMKLDNN); output->set_format(platform::GetMKLDNNFormat(*dst_memory)); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 
5df2a546812adec055573a0d9c2c5c373fbed928..9c63afff13c22c7ad4ec283f2b25c2bc4535e6d1 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -174,12 +174,9 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { output, in_out.format(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(target_mem, dst_mem); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *dst_mem, *target_mem); - astream.wait(); - } + + reorder_p->execute(astream, *dst_mem, *target_mem); + astream.wait(); } output->set_layout(framework::DataLayout::kMKLDNN); output->set_format(platform::GetMKLDNNFormat(*dst_mem)); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 7251653793f89900efa5382db74201a1fc232574..7bd2eb5c5eba6733c2c52f745b28fa4230d12b64 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -25,7 +25,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 0612417c46ce30a73ce0cbc582be740023ff0ab6..6be0e703e564ceb397ea90c810f4018388b2838e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -25,7 +25,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index b1001b4e5684be02df4784711ad459cd2005affb..82ea75943dee41c1c52b2f6e6f1bb9a71fa4a8f3 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/operator.h" @@ -1150,6 +1151,18 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output, workspace_ptr, workspace_size)); } +/* static */ void MLUCnnl::AdaptivePoolingForward( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlTensorDescriptor_t index_desc, void* index) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlAdaptivePoolingForward(handle, input_desc, input, pool_mode, + output_desc, output, index_desc, index)); +} + /* static */ void MLUCnnl::Pool3D( const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, const std::vector& output_shape, @@ -1801,6 +1814,17 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { y, diff_y_desc, diff_y, x_desc, x, beta, diff_x_desc, diff_x)); } +/* static */ void MLUCnnl::AdaptivePoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t index_desc, const void* index, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAdaptivePoolingBackward( + handle, y_desc, y, index_desc, index, pool_mode, diff_x_desc, diff_x)); +} + /* static */ void MLUCnnl::NonMaxSuppression( const ExecutionContext& ctx, const cnnlNmsDescriptor_t nms_desc, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index ad912c034683f491ef782e1494a96e2442865385..91eddaf792e8aed0097aecba2c8295ed65262b50 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -649,6 +649,12 @@ class MLUCnnl { const void* input, const void* beta, const void* extra_input_ptr, const cnnlTensorDescriptor_t output_desc, void* output); + static void AdaptivePoolingForward( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlTensorDescriptor_t index_desc, void* index); + static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, const std::vector& output_shape, cnnlPoolingDescriptor_t pooling_desc, const void* alpha, @@ -958,6 +964,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void AdaptivePoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t index_desc, const void* index, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void PoolingIndex(const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, const cnnlTensorDescriptor_t x_desc, const void* x, diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index 7c1cf9109c566625743f69de8cf3213855600c69..b96fcaa486cce8099cf1d03c7d948ea74c1923ad 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc 
@@ -18,7 +18,7 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add_grad); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index a64a9c274ed7dcf96abc43e34e9a21a4dbe7a6be..1bbd671323e6d9b189844556d1071d55e7fba57c 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -21,12 +21,12 @@ namespace operators { namespace { cnnlPoolingMode_t ToCnnlPoolingMode(const std::string &pooling_type, - bool exclusive) { + bool exclusive, bool adaptive) { cnnlPoolingMode_t pooling_mode; if (pooling_type == "max") { pooling_mode = CNNL_POOLING_MAX; } else if (pooling_type == "avg") { - if (exclusive) { + if (exclusive && !adaptive) { pooling_mode = CNNL_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; } else { pooling_mode = CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; @@ -64,10 +64,7 @@ class MLUPoolOpKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Only support 4-dims for mlu pool2d kernel.")); - PADDLE_ENFORCE_EQ(adaptive, false, - platform::errors::InvalidArgument( - "Not support adaptive for mlu pool2d kernel.")); - + const bool channel_last = data_format == "NHWC"; // default cnnlTensorLayout_t cnnl_layout = CNNL_LAYOUT_NCHW; auto out_dims = out->dims(); @@ -77,7 +74,6 @@ class MLUPoolOpKernel : public framework::OpKernel { framework::DDim data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size()); - const bool channel_last = data_format == "NHWC"; if (channel_last) { cnnl_layout = CNNL_LAYOUT_NHWC; out_h = out_dims[1]; @@ -94,42 +90,74 @@ class MLUPoolOpKernel : public framework::OpKernel { MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); MLUCnnlTensorDesc out_desc(*out, cnnl_layout, ToCnnlDataType()); - cnnlPoolingMode_t pool_mode = ToCnnlPoolingMode(pooling_type, exclusive); - MLUCnnlPoolingDesc pool_desc( - pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], - paddings[1], paddings[2], paddings[3], strides[0], strides[1], - 1 /*row_dilation*/, 1 /*col_dilation*/, ceil_mode); + cnnlPoolingMode_t pool_mode = + ToCnnlPoolingMode(pooling_type, exclusive, adaptive); + + if (!adaptive) { + MLUCnnlPoolingDesc pool_desc( + pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], + paddings[1], paddings[2], paddings[3], strides[0], strides[1], + 1 /*row_dilation*/, 1 /*col_dilation*/, ceil_mode); + + size_t extra_input_size = 0; + cnnlHandle_t handle = + ctx.template device_context().cnnl_handle(); + cnnlGetPoolingExtraInputSize(handle, pool_mode, out_w, out_h, + &extra_input_size); - size_t extra_input_size = 0; - cnnlHandle_t handle = - ctx.template device_context().cnnl_handle(); - cnnlGetPoolingExtraInputSize(handle, pool_mode, out_w, out_h, - &extra_input_size); - - if (extra_input_size > 0) { - paddle::platform::CPUDeviceContext cpu_ctx; - framework::Tensor extra_host_tensor = - ctx.AllocateTmpTensor( - {static_cast(extra_input_size)}, cpu_ctx); - cnnlInitPoolingExtraInput(handle, pool_desc.get(), in_x_desc.get(), - out_desc.get(), GetBasePtr(&extra_host_tensor)); - framework::Tensor extra_device_tensor = - ctx.AllocateTmpTensor( - {static_cast(extra_input_size)}, dev_ctx); - // TODO(fwg): use Async copy, and add a callback to stream that free host - // memory. 
- framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(), - &extra_device_tensor); - MLUCnnl::PoolingForward( - ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, - in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, - GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/, out_desc.get(), - GetBasePtr(out)); + if (extra_input_size > 0) { + paddle::platform::CPUDeviceContext cpu_ctx; + framework::Tensor extra_host_tensor = + ctx.AllocateTmpTensor( + {static_cast(extra_input_size)}, cpu_ctx); + cnnlInitPoolingExtraInput(handle, pool_desc.get(), in_x_desc.get(), + out_desc.get(), + GetBasePtr(&extra_host_tensor)); + framework::Tensor extra_device_tensor = + ctx.AllocateTmpTensor( + {static_cast(extra_input_size)}, dev_ctx); + // TODO(fwg): use Async copy, and add a callback to stream that free + // host + // memory. + framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(), + &extra_device_tensor); + MLUCnnl::PoolingForward( + ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, + in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, + GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/, + out_desc.get(), GetBasePtr(out)); + } else { + MLUCnnl::PoolingForward( + ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, + in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, + nullptr /*params_shape_ptr*/, out_desc.get(), GetBasePtr(out)); + } } else { - MLUCnnl::PoolingForward( - ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, - in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, - nullptr /*params_shape_ptr*/, out_desc.get(), GetBasePtr(out)); + // cnnl Adaptive pooling only support NHWC layout + framework::Tensor trans_in_x; + framework::Tensor trans_out; + if (channel_last) { + trans_in_x = *in_x; + trans_out = *out; + } else { + std::vector perm{0, 2, 3, 1}; + TransposeFromMLUTensor(ctx, perm, in_x, &trans_in_x, + true /*need_reshape_or_alloc*/); + trans_out = ctx.AllocateTmpTensor( + {out_dims[0], out_dims[2], out_dims[3], out_dims[1]}, dev_ctx); + } + MLUCnnlTensorDesc trans_in_x_desc(trans_in_x, CNNL_LAYOUT_NHWC, + ToCnnlDataType()); + MLUCnnlTensorDesc trans_out_desc(trans_out, CNNL_LAYOUT_NHWC, + ToCnnlDataType()); + MLUCnnl::AdaptivePoolingForward( + ctx, pool_mode, trans_in_x_desc.get(), GetBasePtr(&trans_in_x), + trans_out_desc.get(), GetBasePtr(&trans_out), nullptr, nullptr); + if (!channel_last) { + std::vector perm{0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm, &trans_out, out, + false /*need_reshape_or_alloc*/); + } } } }; @@ -204,7 +232,8 @@ class MLUPoolGradOpKernel : public framework::OpKernel { MLUCnnlTensorDesc trans_in_x_grad_desc(trans_in_x_grad, CNNL_LAYOUT_NHWC, ToCnnlDataType()); - cnnlPoolingMode_t pool_mode = ToCnnlPoolingMode(pooling_type, exclusive); + cnnlPoolingMode_t pool_mode = + ToCnnlPoolingMode(pooling_type, exclusive, adaptive); MLUCnnlPoolingDesc pool_desc( pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], paddings[1], paddings[2], paddings[3], strides[0], strides[1], @@ -219,18 +248,34 @@ class MLUPoolGradOpKernel : public framework::OpKernel { MLUCnnl::PoolingIndex(ctx, pool_desc.get(), trans_in_x_desc.get(), GetBasePtr(&trans_in_x), index_tensor_desc.get(), GetBasePtr(&index_tensor)); - MLUCnnl::PoolingBackward( - ctx, pool_desc.get(), nullptr /*alpha*/, index_tensor_desc.get(), - GetBasePtr(&index_tensor), trans_out_grad_desc.get(), - GetBasePtr(&trans_out_grad), trans_in_x_desc.get(), - GetBasePtr(&trans_in_x), nullptr /*beta*/, trans_in_x_grad_desc.get(), - 
GetBasePtr(&trans_in_x_grad)); + if (adaptive) { + MLUCnnl::AdaptivePoolingBackward( + ctx, pool_mode, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), index_tensor_desc.get(), + GetBasePtr(&index_tensor), trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } else { + MLUCnnl::PoolingBackward( + ctx, pool_desc.get(), nullptr /*alpha*/, index_tensor_desc.get(), + GetBasePtr(&index_tensor), trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), trans_in_x_desc.get(), + GetBasePtr(&trans_in_x), nullptr /*beta*/, + trans_in_x_grad_desc.get(), GetBasePtr(&trans_in_x_grad)); + } } else { - MLUCnnl::PoolingBackward(ctx, pool_desc.get(), nullptr /*alpha*/, nullptr, - nullptr, trans_out_grad_desc.get(), - GetBasePtr(&trans_out_grad), nullptr, nullptr, - nullptr /*beta*/, trans_in_x_grad_desc.get(), - GetBasePtr(&trans_in_x_grad)); + if (adaptive) { + MLUCnnl::AdaptivePoolingBackward( + ctx, pool_mode, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), nullptr /*index_tensor_desc.get()*/, + nullptr /*GetBasePtr(&index_tensor)*/, trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } else { + MLUCnnl::PoolingBackward(ctx, pool_desc.get(), nullptr /*alpha*/, + nullptr, nullptr, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), nullptr, nullptr, + nullptr /*beta*/, trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } } if (!channel_last) { std::vector perm{0, 3, 1, 2}; diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index c8b6404830cdac0427f99b2e1d2c642fe8aa0f38..dfeec15d9b887aa55b81004b728a7c31fc8b4be7 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -56,12 +56,13 @@ class QrGPUKernel : public framework::OpKernel { int tau_stride = min_mn; if (compute_q) { - q.mutable_data>( + q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(math::Real))); + size_t(batch_size * m * k * sizeof(pten::funcs::Real))); } - r.mutable_data>( - context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + r.mutable_data>( + context.GetPlace(), + size_t(batch_size * k * n * sizeof(pten::funcs::Real))); auto dito = math::DeviceIndependenceTensorOperations { // Note: allocate temporary tensors because of lacking in-place operatios. 
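// Editorial note (standalone sketch, not part of the patch): the pool_op_mlu.cc
// hunks above enable adaptive pooling on MLU by routing it through cnnl's
// NHWC-only adaptive kernels -- NCHW tensors are transposed with perm {0, 2, 3, 1}
// before the call and back with {0, 3, 1, 2} afterwards. The helper below only
// illustrates what that NCHW -> NHWC permutation does to a flat buffer; the
// function name and types are illustrative and are not Paddle APIs.
#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> TransposeNCHWToNHWC(const std::vector<T>& src, size_t n,
                                   size_t c, size_t h, size_t w) {
  std::vector<T> dst(src.size());
  for (size_t bn = 0; bn < n; ++bn) {
    for (size_t ic = 0; ic < c; ++ic) {
      for (size_t ih = 0; ih < h; ++ih) {
        for (size_t iw = 0; iw < w; ++iw) {
          // flat NCHW offset -> flat NHWC offset
          dst[((bn * h + ih) * w + iw) * c + ic] =
              src[((bn * c + ic) * h + ih) * w + iw];
        }
      }
    }
  }
  return dst;
}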
// Prepare qr Tensor qr; - qr.mutable_data>( - context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real))); + qr.mutable_data>( + context.GetPlace(), + size_t(batch_size * m * n * sizeof(pten::funcs::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input paddle::framework::TensorCopy(x, context.GetPlace(), &qr); @@ -124,7 +126,8 @@ class QrGPUKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(math::Real), dev_ctx.stream()); + qr_stride * sizeof(pten::funcs::Real), + dev_ctx.stream()); } BatchedOrgqr( dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, tau_data, diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index c55619a4f76e7f316c6c7bcb689e2a101e5908eb..b8308b29106be39bacbf05028809e7206ea63cec 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -18,9 +18,9 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -74,17 +74,20 @@ class QrCPUKernel : public framework::OpKernel { int q_stride = m * k; int r_stride = k * n; - auto* x_data = x.data>(); + auto* x_data = x.data>(); T* q_data = nullptr; if (compute_q) { - q_data = q.mutable_data>( + q_data = q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(math::Real))); - memset(q_data, 0, size_t(batch_size * m * k * sizeof(math::Real))); + size_t(batch_size * m * k * sizeof(pten::funcs::Real))); + memset(q_data, 0, + size_t(batch_size * m * k * sizeof(pten::funcs::Real))); } - auto* r_data = r.mutable_data>( - context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(math::Real))); + auto* r_data = r.mutable_data>( + context.GetPlace(), + size_t(batch_size * k * n * sizeof(pten::funcs::Real))); + memset(r_data, 0, + size_t(batch_size * k * n * sizeof(pten::funcs::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -140,7 +143,7 @@ class QrGradKernel : public framework::OpKernel { // Use a different name dA instead of dX framework::Tensor& dA = *ctx.Output(framework::GradVarName("X")); - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); pten::funcs::SetConstant()(dev_ctx, &dA, T(0)); @@ -222,7 +225,7 @@ class QrGradKernel : public framework::OpKernel { } else { // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] // Calculate dX and dY individually and concatenate them to get dA - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto Y = dito.Slice(A, {-1}, {m}, {n}); auto U = dito.Slice(R, {-1}, {0}, {m}); diff --git a/paddle/fluid/operators/real_op.h b/paddle/fluid/operators/real_op.h index 6cc9065269c62716b54c329d46711ff96f83f015..41549393f578ff6109b629a6036cbbef108b398c 100644 --- a/paddle/fluid/operators/real_op.h +++ b/paddle/fluid/operators/real_op.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -31,12 +31,13 @@ class RealKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), static_cast(numel * sizeof(math::Real))); + auto* out_data = out->mutable_data>( + ctx.GetPlace(), + static_cast(numel * sizeof(pten::funcs::Real))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::RealFunctor functor(x_data, out_data, numel); + pten::funcs::RealFunctor functor(x_data, out_data, numel); for_range(functor); } }; @@ -51,13 +52,13 @@ class RealGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, numel); - math::RealToComplexFunctor functor(dout_data, dx_data, numel); + pten::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } }; diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 5dcb81c75407f7c4b4a2f787d04e3085f366b348..a27b6ae90f29a14af5e0a119fb3f5d0182dafa7c 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -85,9 +85,6 @@ class ReduceMKLDNNKernel : public framework::OpKernel { auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/operators/renorm_op.h b/paddle/fluid/operators/renorm_op.h index 461f383ad25639fe2db9b64eb490ad1e7a769a4a..753ed9e27ac0918b7f36cd347b190b80714ccde5 100644 --- a/paddle/fluid/operators/renorm_op.h +++ b/paddle/fluid/operators/renorm_op.h @@ -17,8 +17,8 @@ #include "math.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 77c4a2005e3bf71c46b24e75d8c929507d2ca8a0..74095d2ce4e657f247f49818d9280295c68d5247 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -657,30 +657,6 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::Reshape2DoubleGradOp, ops::ReshapeDoubleGradInplaceInferer, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL_FUNCTOR( - reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t, - ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel, bool, ops::ReshapeKernel, - paddle::platform::bfloat16, ops::ReshapeKernel, - paddle::platform::complex, ops::ReshapeKernel, - 
paddle::platform::complex, ops::ReshapeKernel); - -REGISTER_OP_CPU_KERNEL_FUNCTOR( - reshape2_grad, float, ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t, - ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, bool, - ops::ReshapeGradKernel, paddle::platform::bfloat16, ops::ReshapeGradKernel, - paddle::platform::complex, ops::ReshapeGradKernel, - paddle::platform::complex, ops::ReshapeGradKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR( - reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, - ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t, - ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, bool, - ops::ReshapeDoubleGradKernel, paddle::platform::bfloat16, - ops::ReshapeDoubleGradKernel, paddle::platform::complex, - ops::ReshapeDoubleGradKernel, paddle::platform::complex, - ops::ReshapeDoubleGradKernel); - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, @@ -695,45 +671,4 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel, plat::float16, ops::ReshapeGradKernel, plat::bfloat16, ops::ReshapeGradKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - uint8_t, ops::ReshapeKernel, int64_t, - ops::ReshapeKernel, plat::float16, - ops::ReshapeKernel, bool, ops::ReshapeKernel, - plat::complex, ops::ReshapeKernel, - plat::complex, ops::ReshapeKernel, - plat::bfloat16, ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR( - reshape2_grad, float, ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t, - ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, - ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex, - ops::ReshapeGradKernel, plat::complex, ops::ReshapeGradKernel, - plat::bfloat16, ops::ReshapeGradKernel); - -REGISTER_OP_CUDA_KERNEL_FUNCTOR( - reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, - ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t, - ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, - plat::float16, ops::ReshapeDoubleGradKernel, bool, - ops::ReshapeDoubleGradKernel, plat::complex, - ops::ReshapeDoubleGradKernel, plat::complex, - ops::ReshapeDoubleGradKernel, plat::bfloat16, ops::ReshapeDoubleGradKernel); -#endif - -#ifdef PADDLE_WITH_XPU -REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel, plat::float16, - ops::ReshapeKernel, bool, ops::ReshapeKernel, - plat::complex, ops::ReshapeKernel, - plat::complex, ops::ReshapeKernel); -REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel, plat::float16, - ops::ReshapeGradKernel, bool, - ops::ReshapeGradKernel, plat::complex, - ops::ReshapeGradKernel, plat::complex, - ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index c932834db39b30c50746aeee80fcd32b5090f58f..77703637db5cd7d34c865083bd765e1122b7fefb 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -20,11 +20,11 @@ #include #include 
"paddle/fluid/operators/conj_op.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/spectral_helper.h" #include "paddle/fluid/operators/spectral_op.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -115,8 +115,8 @@ void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, framework::Tensor input_conj(input->type()); input_conj.mutable_data(input->dims(), ctx.GetPlace()); platform::ForRange for_range(ctx, input->numel()); - math::ConjFunctor functor(input->data(), input->numel(), - input_conj.data()); + pten::funcs::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); for_range(functor); exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); } else if (fft_type == FFTTransformType::R2C && !forward) { @@ -126,8 +126,8 @@ void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); platform::ForRange for_range(ctx, output->numel()); - math::ConjFunctor functor(out_conj.data(), output->numel(), - output->data()); + pten::funcs::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); for_range(functor); } else { exec_cufft_plan_raw(config, input->data(), output->data(), forward); @@ -227,8 +227,8 @@ void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, framework::Tensor input_conj(input->type()); input_conj.mutable_data(input->dims(), ctx.GetPlace()); platform::ForRange for_range(ctx, input->numel()); - math::ConjFunctor functor(input->data(), input->numel(), - input_conj.data()); + pten::funcs::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); for_range(functor); exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); } else if (fft_type == FFTTransformType::R2C && !forward) { @@ -238,8 +238,8 @@ void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); platform::ForRange for_range(ctx, output->numel()); - math::ConjFunctor functor(out_conj.data(), output->numel(), - output->data()); + pten::funcs::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); for_range(functor); } else { exec_hipfft_plan_raw(config, input->data(), output->data(), forward); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 3a57a7b3e54cc3313654d20256b888efdb4baf5a..4384e7152fa4e56554a3effd7e82b56b03a1c585 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -25,9 +25,9 @@ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { @@ -105,7 +105,8 @@ struct RealMulComplexFunctor { "The image part of y must to be 0" "but got [%d]", y.imag)); - return platform::complex>(x.real * y.real, x.imag * y.real); + return platform::complex>(x.real * y.real, + x.imag * y.real); } }; @@ -390,11 +391,11 @@ struct DeviceIndependenceTensorOperations { // batch_diag for 
CPU only Tensor BatchDiag(const Tensor& x, int batch) { Tensor out; - auto* x_data = x.data>(); + auto* x_data = x.data>(); auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(math::Real))); + static_cast(numel * sizeof(pten::funcs::Real))); auto x_dims = x.dims(); int num_dims = x_dims.size(); @@ -654,7 +655,7 @@ struct DeviceIndependenceTensorOperations { auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); auto* x_data = x.data(); auto for_range = GetForRange(x.numel()); - math::ConjFunctor functor(x_data, x.numel(), out_data); + pten::funcs::ConjFunctor functor(x_data, x.numel(), out_data); for_range(functor); return out; } @@ -662,12 +663,12 @@ struct DeviceIndependenceTensorOperations { Tensor Real(const Tensor& x) { Tensor out; auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(math::Real))); + static_cast(numel * sizeof(pten::funcs::Real))); auto* x_data = x.data(); auto for_range = GetForRange(numel); - math::RealFunctor functor(x_data, out_data, numel); + pten::funcs::RealFunctor functor(x_data, out_data, numel); for_range(functor); return out; } diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index f387dca7b7f9b2c4e741d8f495a58b05a46c6c6f..4042fcccf33090e11f14ec0effc1e5b9ddd95258 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -17,9 +17,9 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -46,14 +46,14 @@ class SvdCPUKernel : public framework::OpKernel { int col_u = full ? rows : k; int col_v = full ? cols : k; int batches = numel / (rows * cols); - auto* U_out = U->mutable_data>( + auto* U_out = U->mutable_data>( context.GetPlace(), - size_t(batches * rows * col_u * sizeof(math::Real))); - auto* VH_out = VH->mutable_data>( + size_t(batches * rows * col_u * sizeof(pten::funcs::Real))); + auto* VH_out = VH->mutable_data>( context.GetPlace(), - size_t(batches * col_v * cols * sizeof(math::Real))); - auto* S_out = S->mutable_data>( - context.GetPlace(), size_t(batches * k * sizeof(math::Real))); + size_t(batches * col_v * cols * sizeof(pten::funcs::Real))); + auto* S_out = S->mutable_data>( + context.GetPlace(), size_t(batches * k * sizeof(pten::funcs::Real))); /*SVD Use the Eigen Library*/ math::BatchSvd(x_data, U_out, VH_out, S_out, rows, cols, batches, full); } diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index 60eeb66ae7d1eca6e093432bfdc4e5f12f47f2e9..29ba5bcc1b5bb27528ee01bbf85208978cb4f97c 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -21,7 +21,7 @@ limitations under the License. 
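// Editorial note (generic sketch, not Paddle's definitions): the qr/svd/real/
// spectral hunks above only change where the complex functors live
// (paddle/fluid/operators/math -> paddle/pten/kernels/funcs), so math::Real,
// math::RealFunctor and math::ConjFunctor become pten::funcs::*. Roughly,
// Real<T> maps a complex element type to its underlying real type and
// RealFunctor copies real parts element-wise; a minimal standalone analogue
// using std::complex:
#include <complex>
#include <cstdint>

template <typename T>
struct RealTrait { using type = T; };                    // real types map to themselves
template <typename T>
struct RealTrait<std::complex<T>> { using type = T; };   // complex<T> -> T
template <typename T>
using Real = typename RealTrait<T>::type;

template <typename T>
struct RealFunctor {
  RealFunctor(const T* in, Real<T>* out, int64_t numel)
      : in_(in), out_(out), numel_(numel) {}
  // intended to be driven by a ForRange-style loop over [0, numel)
  void operator()(int64_t idx) const { out_[idx] = std::real(in_[idx]); }
  const T* in_;
  Real<T>* out_;
  int64_t numel_;
};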
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" USE_OP(relu); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(softmax); namespace paddle { diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h index f64b016366e39b2260f4f8aebbb2e371ee2a8a7a..e892d258f3b126c0f6532f215e411837a415ee27 100644 --- a/paddle/fluid/operators/triangular_solve_op.h +++ b/paddle/fluid/operators/triangular_solve_op.h @@ -19,10 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { @@ -152,7 +152,7 @@ class TriangularSolveGradKernel : public framework::OpKernel { // calculate x's conjugate for complex Tensor x_conj(x->type()); platform::ForRange x_for_range(dev_ctx, x->numel()); - math::ConjFunctor x_functor( + pten::funcs::ConjFunctor x_functor( x->data(), x->numel(), x_conj.mutable_data(x->dims(), dev_ctx.GetPlace())); x_for_range(x_functor); @@ -179,7 +179,7 @@ class TriangularSolveGradKernel : public framework::OpKernel { // calculate out's conjugate for complex Tensor out_conj(out->type()); platform::ForRange out_for_range(dev_ctx, out->numel()); - math::ConjFunctor out_functor( + pten::funcs::ConjFunctor out_functor( out->data(), out->numel(), out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); out_for_range(out_functor); diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 2cd068badf51e8a3176de4ec80700ce7057862d1..ecad5340d71c1ae32339ab1c79bf37d947402747 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,3 +1,18 @@ +IF(WITH_CUSTOM_DEVICE) +cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place) + +cc_library(device_guard SRCS device_guard.cc DEPS enforce place) + +cc_library(stream SRCS stream.cc DEPS callback_manager) + +cc_library(event SRCS event.cc DEPS enforce place) + +cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags) + +ENDIF() + +set(DEV_LIBS custom_device) + # GPU IF(WITH_GPU OR WITH_ROCM) add_subdirectory(gpu) @@ -22,3 +37,11 @@ ENDIF() IF(WITH_MLU) add_subdirectory(mlu) ENDIF() + +# CUSTOM +IF(WITH_CUSTOM_DEVICE) + add_subdirectory(custom) + + cc_library(device_manager SRCS device_manager.cc DEPS custom_device) + set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library") +ENDIF() diff --git a/paddle/fluid/platform/device/callback_manager.cc b/paddle/fluid/platform/device/callback_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..c677bc0262f0cfba0a5995afbde9e04f4bb0337e --- /dev/null +++ b/paddle/fluid/platform/device/callback_manager.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +CallbackManager::CallbackManager(stream::Stream *stream) + : stream_(stream), thread_pool_(1) {} + +void CallbackManager::AddCallback(std::function callback) const { + auto *callback_func = new std::function(std::move(callback)); + auto *func = new std::function([this, callback_func] { + std::lock_guard lock(mtx_); + last_future_ = thread_pool_.enqueue([callback_func] { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); + }); + + platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) + ->AddCallback(stream_, func); +} + +void CallbackManager::Wait() const { + platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) + ->SynchronizeStream(stream_); + + { + std::lock_guard lock(mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/callback_manager.h b/paddle/fluid/platform/device/callback_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..0edc694c94bb7846ac6081bccc0dc7fecd61adcb --- /dev/null +++ b/paddle/fluid/platform/device/callback_manager.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
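// Editorial note (pattern sketch, independent of Paddle's ThreadPool): the
// CallbackManager implementation above keeps ordering by pushing every host
// callback onto a single worker thread and remembering the future of the most
// recently enqueued task; Wait() synchronizes the stream first and then blocks
// on that last future. A minimal standalone version of the "serialize and wait
// for the tail" idea, with illustrative names:
#include <condition_variable>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <thread>

class SerialCallbackQueue {
 public:
  SerialCallbackQueue() : worker_([this] { Run(); }) {}

  ~SerialCallbackQueue() {
    {
      std::lock_guard<std::mutex> lock(mtx_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }

  // Enqueue a callback; it runs after every previously enqueued callback.
  std::shared_future<void> Add(std::function<void()> fn) {
    auto task = std::make_shared<std::packaged_task<void()>>(std::move(fn));
    std::shared_future<void> done = task->get_future().share();
    {
      std::lock_guard<std::mutex> lock(mtx_);
      tasks_.push([task] { (*task)(); });
      last_ = done;  // remember the tail of the queue
    }
    cv_.notify_one();
    return done;
  }

  // Block until every callback enqueued so far has finished.
  void WaitAll() {
    std::shared_future<void> tail;
    {
      std::lock_guard<std::mutex> lock(mtx_);
      tail = last_;
    }
    if (tail.valid()) tail.wait();
  }

 private:
  void Run() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mtx_);
        cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();
    }
  }

  std::mutex mtx_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  std::shared_future<void> last_;
  bool stop_ = false;
  std::thread worker_;  // declared last so it starts after the other members
};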
+ +#pragma once + +#include + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + +#include +#include // NOLINT +#include +#include // NOLINT + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +namespace stream { +class Stream; +} // namespace stream + +// NOTE(zjl): clean CallbackManager to make compilation faster +// Make CallbackManager thread-safe +class CallbackManager { + public: + explicit CallbackManager(stream::Stream* stream); + + ~CallbackManager() = default; + + void AddCallback(std::function callback) const; + + void Wait() const; + + private: + stream::Stream* stream_; + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future last_future_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f39c60c0c68edcdaca4bd4a0b25a9ec07453280e --- /dev/null +++ b/paddle/fluid/platform/device/custom/CMakeLists.txt @@ -0,0 +1,4 @@ +IF(WITH_CUSTOM_DEVICE) +cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context) +cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context ) +ENDIF() diff --git a/paddle/fluid/platform/device/custom/custom_device.cc b/paddle/fluid/platform/device/custom/custom_device.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5b98d3e2289588144e864bcbaed98f345bfad3c --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device.cc @@ -0,0 +1,672 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
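// Editorial note (design sketch with illustrative types): custom_device.cc,
// which begins below, adapts the plugin's plain C function-pointer table
// (C_DeviceInterface) to the C++ DeviceInterface virtual API; calls check
// whether the plugin supplied an optional entry point and otherwise fall back
// or report the feature as unavailable. The adapter pattern in miniature --
// CFunctionTable and DeviceAdapter are placeholders, not Paddle types:
#include <memory>
#include <stdexcept>
#include <utility>

struct CFunctionTable {                          // stand-in for the plugin's C table
  int (*get_device_count)(int* count);           // required entry point
  int (*synchronize_device)(int device_id);      // optional, may be null
};

class DeviceAdapter {                            // stand-in for the C++ interface
 public:
  explicit DeviceAdapter(std::unique_ptr<CFunctionTable> table)
      : table_(std::move(table)) {}

  int GetDeviceCount() const {
    int count = 0;
    if (table_->get_device_count(&count) != 0) count = 0;
    return count;
  }

  void SynchronizeDevice(int device_id) const {
    if (table_->synchronize_device == nullptr) {
      throw std::runtime_error("synchronize_device not provided by plugin");
    }
    if (table_->synchronize_device(device_id) != 0) {
      throw std::runtime_error("synchronize_device failed");
    }
  }

 private:
  std::unique_ptr<CFunctionTable> table_;
};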
+ +#include "paddle/fluid/platform/device/device_base.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device_context.h" + +static bool operator==(const C_Device_st& d1, const C_Device_st& d2) { + return d1.id == d2.id; +} + +namespace paddle { +namespace platform { + +class CustomDevice : public DeviceInterface { + public: + CustomDevice(const std::string& type, int priority, bool is_custom, + std::unique_ptr pimpl, void* dso_handle) + : DeviceInterface(type, priority, is_custom), + pimpl_(std::move(pimpl)), + dso_handle_(dso_handle) { + Initialize(); + } + + ~CustomDevice() override { Finalize(); } + + size_t GetDeviceCount() override { + size_t count; + if (pimpl_->get_device_count(&count) != C_SUCCESS) { + count = 0; + } + return count; + } + + std::vector GetDeviceList() override { + size_t count = GetDeviceCount(); + std::vector devices(count); + pimpl_->get_device_list(devices.data()); + return devices; + } + + C_DeviceInterface* Impl() { return pimpl_.get(); } + + void SynchronizeDevice(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_device(device)); + } + + void Initialize() override { + if (pimpl_->initialize && pimpl_->initialize() != C_SUCCESS) { + LOG(ERROR) << "Initialize " << Type() << " Failed\n"; + exit(-1); + } + auto devices = GetDeviceList(); + for (auto dev_id : devices) { + C_Device_st device; + device.id = dev_id; + devices_pool[dev_id] = device; + InitDevice(dev_id); + } + } + + void Finalize() override { + auto devices = GetDeviceList(); + for (auto dev_id : devices) { + // SetDevice(dev_id); + // SynchronizeDevice(dev_id); + DeInitDevice(dev_id); + } + + bool ok = true; + if (pimpl_->finalize && pimpl_->finalize() != C_SUCCESS) { + LOG(ERROR) << "Finalize " << Type() << " Failed\n"; + ok = false; + } + if (dso_handle_) { + dlclose(dso_handle_); + dso_handle_ = nullptr; + } + if (!ok) { + exit(1); + } + } + + void InitDevice(size_t dev_id) override { + if (pimpl_->init_device) { + // Core set logical id, and Plugin replace it with physical id + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->init_device(device)); + } + } + + void DeInitDevice(size_t dev_id) override { + if (pimpl_->deinit_device) { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->deinit_device(device)); + } + } + + void SetDevice(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->set_device(device)); + } + + int GetDevice() override { + C_Device_st device; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->get_device(&device)); + return device.id; + } + + void CreateStream(size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = + stream::Stream::Flag::kDefaultFlag) override { + if (priority != stream::Stream::Priority::kNormal || + flag != stream::Stream::Flag::kDefaultFlag) { + PADDLE_THROW(platform::errors::Unavailable( + "priority != stream::Stream::Priority::kNormal || flag != " + "stream::Stream::Flag::kDefaultFlag is not allowed on " + "CustomDevice.")); + } + const auto device = &devices_pool[dev_id]; + C_Stream c_stream; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_stream(device, &c_stream)); + 
stream->set_stream(c_stream); + } + + void DestroyStream(size_t dev_id, stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_stream( + device, reinterpret_cast(stream->raw_stream()))); + } + + void SynchronizeStream(size_t dev_id, const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_stream( + device, reinterpret_cast(stream->raw_stream()))); + } + + bool QueryStream(size_t dev_id, const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->query_stream) { + SynchronizeStream(dev_id, stream); + return true; + } + if (pimpl_->query_stream(device, reinterpret_cast( + stream->raw_stream())) == C_SUCCESS) { + return true; + } + return false; + } + + void AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback) override { + if (!pimpl_->stream_add_callback) { + PADDLE_THROW(platform::errors::Unavailable( + "AddCallback is not supported on %s.", Type())); + } else { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_add_callback( + device, reinterpret_cast(stream->raw_stream()), + [](C_Device device, C_Stream stream, void* user_data, + C_Status* status) { + std::unique_ptr> func( + reinterpret_cast*>(user_data)); + (*func)(); + }, + callback)); + } + } + + void CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags) override { + const auto device = &devices_pool[dev_id]; + C_Event c_event; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_event(device, &c_event)); + event->set_event(c_event); + } + + void DestroyEvent(size_t dev_id, event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_event( + device, reinterpret_cast(event->raw_event()))); + } + + void RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->record_event( + device, reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } + + void SynchronizeEvent(size_t dev_id, const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_event( + device, reinterpret_cast(event->raw_event()))); + } + + bool QueryEvent(size_t dev_id, const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->query_event) { + SynchronizeEvent(dev_id, event); + return true; + } + if (pimpl_->query_event(device, reinterpret_cast( + event->raw_event())) == C_SUCCESS) { + return true; + } + return false; + } + + void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event( + device, reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } + + void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_h2d) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + 
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_h2d(device, dst, src, size)); + } + } + + void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2h) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_d2h(device, dst, src, size)); + } + } + + void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2d) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_d2d(device, dst, src, size)); + } + } + + void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_dev_id, + const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + int dst_dev_id = PlaceToId(dst_place); + auto dst_device = &devices_pool[dst_dev_id]; + auto src_device = &devices_pool[src_dev_id]; + + if (stream && stream->raw_stream()) { + if (!pimpl_->async_memory_copy_p2p) { + MemoryCopyP2P(dst_place, dst, src_dev_id, src, size); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->async_memory_copy_p2p( + dst_device, src_device, + reinterpret_cast(stream->raw_stream()), dst, src, size)); + } + } else { + if (!pimpl_->memory_copy_p2p) { + std::unique_ptr tmp(new uint8_t[size]); + MemoryCopyD2H(src_dev_id, tmp.get(), src, size); + MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size); + } else { + auto src_place = platform::CustomPlace(Type(), src_dev_id); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(src_place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size)); + } + } + } + + void* MemoryAllocate(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_allocate(device, &ptr, size)); + return ptr; + } + + void MemoryDeallocate(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_deallocate(device, ptr, size)); + } + + void* MemoryAllocateHost(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_allocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported 
on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->host_memory_allocate(device, &ptr, size)); + } + return ptr; + } + + void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->host_memory_deallocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->host_memory_deallocate(device, ptr, size)); + } + } + + void* MemoryAllocateUnified(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_allocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Unified is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->unified_memory_allocate(device, &ptr, size)); + } + return ptr; + } + + void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_deallocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->unified_memory_deallocate(device, ptr, size)); + } + } + + void MemorySet(size_t dev_id, void* ptr, uint8_t value, + size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (pimpl_->device_memory_set) { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_set(device, ptr, value, size)); + } else { + std::unique_ptr tmp(new uint8_t[size]); + memset(tmp.get(), value, size); + MemoryCopyH2D(dev_id, ptr, tmp.get(), size); + } + } + + void MemoryStats(size_t dev_id, size_t* total, size_t* free) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_stats(device, total, free)); + + size_t used = *total - *free; + VLOG(10) << Type() << " memory usage " << (used >> 20) << "M/" + << (*total >> 20) << "M, " << (*free >> 20) + << "M available to allocate"; + } + + size_t GetMinChunkSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + pimpl_->device_min_chunk_size(device, &size); + VLOG(10) << Type() << " min chunk size " << size << "B"; + return size; + } + + size_t GetMaxChunkSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + if (pimpl_->device_max_chunk_size) { + pimpl_->device_max_chunk_size(device, &size); + VLOG(10) << Type() << " max chunk size " << size << "B"; + } else { + return DeviceInterface::GetMaxChunkSize(dev_id); + } + return size; + } + + size_t GetMaxAllocSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + if (pimpl_->device_max_alloc_size) { + pimpl_->device_max_alloc_size(device, &size); + VLOG(10) << Type() << " max alloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetMaxAllocSize(dev_id); + } + return size; + } + + size_t GetInitAllocSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + size_t size = 0; + if (pimpl_->device_init_alloc_size) { + pimpl_->device_init_alloc_size(device, &size); + VLOG(10) << Type() << " init alloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetInitAllocSize(dev_id); + } + return size; + } + + size_t GetReallocSize(size_t dev_id) override { + const auto device = 
&devices_pool[dev_id]; + size_t size = 0; + if (pimpl_->device_realloc_size) { + pimpl_->device_realloc_size(device, &size); + VLOG(10) << Type() << " realloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetReallocSize(dev_id); + } + return size; + } + + size_t GetExtraPaddingSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t padding_size = 0; + if (pimpl_->device_extra_padding_size) { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_extra_padding_size(device, &padding_size)); + VLOG(10) << Type() << " extra padding size " << (padding_size >> 20) + << "M"; + } else { + return DeviceInterface::GetExtraPaddingSize(dev_id); + } + return 0; + } + + size_t GetComputeCapability() override { + size_t compute_capability = 0; + if (pimpl_->get_compute_capability) { + pimpl_->get_compute_capability(&compute_capability); + } + VLOG(10) << Type() << " get compute capability " << compute_capability; + return compute_capability; + } + + size_t GetRuntimeVersion() override { + size_t version = 0; + if (pimpl_->get_runtime_version) { + pimpl_->get_runtime_version(&version); + } + VLOG(10) << Type() << " get runtime version " << version; + return version; + } + + size_t GetDriverVersion() override { + size_t version = 0; + if (pimpl_->get_driver_version) { + pimpl_->get_driver_version(&version); + } + VLOG(10) << Type() << " get driver version " << version; + return version; + } + + private: + inline int PlaceToIdNoCheck(const Place& place) { + int dev_id = place.GetDeviceId(); + return dev_id; + } + + inline int PlaceToId(const Place& place) { + int dev_id = PlaceToIdNoCheck(place); + PADDLE_ENFORCE_NE(devices_pool.find(dev_id), devices_pool.end(), + platform::errors::NotFound( + "Cannot found %s %d, please check visible devices", + Type(), dev_id)); + return dev_id; + } + + std::unique_ptr pimpl_; + void* dso_handle_; + std::unordered_map devices_pool; +}; + +bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { +#define CHECK_PTR(ptr, required) \ + if (params->interface->ptr == nullptr && required) { \ + LOG(WARNING) << "CustomRuntime [type: " << params->device_type \ + << "] pointer: " << #ptr << " is not set."; \ + return false; \ + } + + int version = params->version.major * 10000 + params->version.minor * 100 + + params->version.patch; + const int runtime_version = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION * 10000 + + PADDLE_CUSTOM_RUNTIME_MINOR_VERSION * 100 + + PADDLE_CUSTOM_RUNTIME_PATCH_VERSION; + + if (version < runtime_version) { + LOG(WARNING) << "CustomRuntime [type: " << params->device_type + << "] version: " << version + << " < PADDLE_CUSTOM_RUNTIME_VERSION " << runtime_version; + return false; + } + + CHECK_PTR(initialize, false); + CHECK_PTR(finalize, false) + + CHECK_PTR(init_device, false); + CHECK_PTR(set_device, true); + CHECK_PTR(get_device, true); + CHECK_PTR(deinit_device, false); + + CHECK_PTR(create_stream, true); + CHECK_PTR(destroy_stream, true); + CHECK_PTR(query_stream, false); + CHECK_PTR(stream_add_callback, false); + + CHECK_PTR(create_event, true); + CHECK_PTR(record_event, true); + CHECK_PTR(destroy_event, true); + CHECK_PTR(query_event, false); + + CHECK_PTR(synchronize_device, false); + CHECK_PTR(synchronize_stream, true); + CHECK_PTR(synchronize_event, true); + CHECK_PTR(stream_wait_event, true); + + CHECK_PTR(device_memory_allocate, true); + CHECK_PTR(device_memory_deallocate, true); + CHECK_PTR(host_memory_allocate, false); + CHECK_PTR(host_memory_deallocate, false); + 
CHECK_PTR(unified_memory_allocate, false); + CHECK_PTR(unified_memory_deallocate, false); + CHECK_PTR(memory_copy_h2d, true); + CHECK_PTR(memory_copy_d2h, true); + CHECK_PTR(memory_copy_d2d, true); + CHECK_PTR(memory_copy_p2p, false); + CHECK_PTR(async_memory_copy_h2d, false); + CHECK_PTR(async_memory_copy_d2h, false); + CHECK_PTR(async_memory_copy_d2d, false); + CHECK_PTR(async_memory_copy_p2p, false); + + CHECK_PTR(get_device_count, true); + CHECK_PTR(get_device_list, true); + CHECK_PTR(device_memory_stats, true); + + CHECK_PTR(device_min_chunk_size, true); + CHECK_PTR(device_max_chunk_size, false); + CHECK_PTR(device_max_alloc_size, false); + CHECK_PTR(device_extra_padding_size, false); + CHECK_PTR(get_compute_capability, false); + CHECK_PTR(get_runtime_version, false); + CHECK_PTR(get_driver_version, false); + + return true; +#undef CHECK_PTR +} + +typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params); + +bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, + std::unique_ptr device_interface, + void* dso_handle) { + if (ValidCustomCustomRuntimeParams(&runtime_params)) { + auto device = + std::make_unique(runtime_params.device_type, 255, true, + std::move(device_interface), dso_handle); + if (false == DeviceManager::Register(std::move(device))) { + LOG(WARNING) << "Skip this library. Register failed!!! there may be a " + "Custom Runtime with the same name."; + return false; + } + } else { + LOG(WARNING) + << "Skip this library. Wrong parameters!!! please check the version " + "compatibility between PaddlePaddle and Custom Runtime."; + return false; + } + return true; +} + +bool LoadCustomRuntimeLib(void* dso_handle) { + CustomRuntimeParams runtime_params; + std::memset(&runtime_params, 0, sizeof(CustomRuntimeParams)); + runtime_params.size = sizeof(CustomRuntimeParams); + auto device_interface = std::make_unique(); + runtime_params.interface = device_interface.get(); + std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface)); + runtime_params.interface->size = sizeof(C_DeviceInterface); + + RegisterDevicePluginFn init_plugin_fn = + reinterpret_cast(dlsym(dso_handle, "InitPlugin")); + if (!init_plugin_fn) { + LOG(WARNING) << "Skip this library. InitPlugin symbol not found."; + return false; + } + init_plugin_fn(&runtime_params); + if (runtime_params.device_type == nullptr) { + LOG(WARNING) + << "Skip this library. InitPlugin failed!!! please check the version " + "compatibility between PaddlePaddle and Custom Runtime."; + return false; + } + return LoadCustomRuntimeLib(runtime_params, std::move(device_interface), + dso_handle); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/custom/custom_device_test.cc b/paddle/fluid/platform/device/custom/custom_device_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a874ea221228ef016ad3bff60620f949582cf9e --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device_test.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
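// Editorial note (generic sketch of the loading pattern, with illustrative
// names): LoadCustomRuntimeLib above assumes the runtime library has already
// been dlopen'ed, resolves its "InitPlugin" entry point with dlsym, lets the
// plugin fill the parameter/function table, and registers the device only when
// validation succeeds. The standalone snippet below shows that dlopen/dlsym
// handshake in isolation; "libmy_runtime.so", PluginParams and PluginInitFn
// are placeholders, not Paddle symbols.
#include <dlfcn.h>
#include <cstdio>

struct PluginParams {                 // stand-in for the real parameter struct
  const char* device_type = nullptr;  // left null if the plugin refuses to init
};
using PluginInitFn = void (*)(PluginParams*);

bool LoadPlugin(const char* path) {
  void* handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return false;
  }
  auto init = reinterpret_cast<PluginInitFn>(dlsym(handle, "InitPlugin"));
  if (init == nullptr) {
    std::fprintf(stderr, "InitPlugin not found, skipping %s\n", path);
    dlclose(handle);
    return false;
  }
  PluginParams params;
  init(&params);
  if (params.device_type == nullptr) {
    dlclose(handle);
    return false;
  }
  std::printf("registered custom runtime: %s\n", params.device_type);
  return true;  // keep `handle` open for the lifetime of the runtime
}
// Usage would be e.g. LoadPlugin("libmy_runtime.so"); the path is illustrative.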
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device/custom/fake_cpu_device.h" +#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/fluid/platform/device_context.h" + +void RegisterDevice() { + CustomRuntimeParams runtime_params; + runtime_params.size = sizeof(CustomRuntimeParams); + auto device_interface = std::make_unique(); + runtime_params.interface = device_interface.get(); + std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface)); + runtime_params.interface->size = sizeof(C_DeviceInterface); + + InitFakeCPUDevice(&runtime_params); + EXPECT_TRUE(paddle::platform::LoadCustomRuntimeLib( + runtime_params, std::move(device_interface), nullptr)); +} + +void InitDevice() { + RegisterDevice(); + EXPECT_GT(static_cast( + paddle::platform::DeviceManager::GetAllDeviceTypes().size()), + 0); + auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0); + auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place); + EXPECT_NE(device, nullptr); + + std::vector places; + auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes(); + for (auto dev_type : device_types) { + auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type); + for (auto dev_id : devices) { + places.push_back( + paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id)); + } + } + EXPECT_GT(static_cast(places.size()), 0); + + paddle::platform::DeviceContextPool::Init(places); +} + +void TestDeviceInterface(const paddle::platform::Place& place) { + std::cout << "TestDeviceInterface on " << place << std::endl; + if (paddle::platform::is_custom_place(place)) { + auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place); + auto dev_type = paddle::platform::PlaceHelper::GetDeviceType(place); + auto p1 = device->MemoryAllocate( + paddle::platform::DeviceManager::GetMinChunkSize(place)); + EXPECT_NE(p1, nullptr); + + paddle::platform::DeviceManager::SetDevice(place); + auto dev_id = paddle::platform::DeviceManager::GetDevice(dev_type); + EXPECT_EQ(dev_id, place.GetDeviceId()); + } +} + +void TestTensorMutableData(const paddle::platform::Place& place) { + std::cout << "TestTensorInitialization on " << place << std::endl; + paddle::framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(paddle::framework::make_ddim({1, 2, 3}), + place); + auto p1_holder = src_tensor.Holder(); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(paddle::framework::make_ddim({3, 1024}), + place); + auto p2_holder = src_tensor.Holder(); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1_holder.get(), p2_holder.get()); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(paddle::framework::make_ddim({2, 2, 3}), + place); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(paddle::framework::make_ddim({2, 2}), + place); + EXPECT_EQ(p1, p2); +} + +void TestTensorShareDataWith(const paddle::platform::Place& place) { + std::cout << "TestTensorShareDataWith on " << place << std::endl; + paddle::framework::Tensor src_tensor; + paddle::framework::Tensor 
dst_tensor; + src_tensor.mutable_data(paddle::framework::make_ddim({2, 3, 4}), place); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); +} + +void TestTensorUtils(const paddle::platform::Place& place) { + if (paddle::platform::is_custom_place(place) == false) { + return; + } + paddle::framework::Tensor src_tensor; + paddle::framework::Tensor gpu_tensor; + paddle::framework::Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data( + paddle::framework::make_ddim({3, 3}), paddle::platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + paddle::platform::CustomDeviceContext gpu_ctx(place); + paddle::framework::TensorCopy(src_tensor, place, gpu_ctx, &gpu_tensor); +#if 0 + // GPU Tensor to CPU Tensor + auto cpu_place = new paddle::platform::CPUPlace(); + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + // Copy the same tensor + paddle::framework::TensorCopy(gpu_tensor, place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + + paddle::framework::Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + paddle::framework::TensorCopy(slice_tensor, place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + EXPECT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); +#endif +} + +TEST(CustomDevice, Tensor) { + InitDevice(); + auto dev_types = paddle::platform::DeviceManager::GetAllDeviceTypes(); + for (const auto& dev_type : dev_types) { + std::cout << "Test on " << dev_type << std::endl; + EXPECT_GT(static_cast( + paddle::platform::DeviceManager::GetDeviceCount(dev_type)), + 0); + auto place = paddle::platform::PlaceHelper::CreatePlace(dev_type); + + TestDeviceInterface(place); + TestTensorMutableData(place); + TestTensorShareDataWith(place); + TestTensorUtils(place); + } +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/platform/device/custom/enforce_custom.h b/paddle/fluid/platform/device/custom/enforce_custom.h new file mode 100644 index 0000000000000000000000000000000000000000..fbdb4627aba2662a2a12cc933a3a4c6e61aa55d5 --- /dev/null +++ b/paddle/fluid/platform/device/custom/enforce_custom.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/device_ext.h" + +namespace paddle { +namespace platform { +namespace details { +template +struct CustomDeviceStatusType {}; + +#define DEFINE_CUSTOM_DEVICE_STATUS_TYPE(type, success_value) \ + template <> \ + struct CustomDeviceStatusType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_CUSTOM_DEVICE_STATUS_TYPE(C_Status, C_SUCCESS); +} // namespace details + +inline std::string build_custom_device_error_msg(C_Status stat) { + std::ostringstream sout; + sout << " CustomDevice error, the error code is : " << stat << ". "; + return sout.str(); +} + +#define PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUSTOM_DEVICE_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CustomDeviceStatusType< \ + __CUSTOM_DEVICE_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = ::paddle::platform::errors::External( \ + ::paddle::platform::build_custom_device_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) +} // namespace platform +} // namespace paddle +#endif // PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/platform/device/custom/fake_cpu_device.h b/paddle/fluid/platform/device/custom/fake_cpu_device.h new file mode 100644 index 0000000000000000000000000000000000000000..c6d8ade4b08597b2c17e5df9dc333c3c4f70d69e --- /dev/null +++ b/paddle/fluid/platform/device/custom/fake_cpu_device.h @@ -0,0 +1,185 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/platform/device/device_ext.h" + +constexpr size_t global_total_memory = 1024 * 1024UL; +static size_t global_free_memory = global_total_memory; + +C_Status Init() { return C_SUCCESS; } + +C_Status InitDevice(const C_Device device) { return C_SUCCESS; } + +C_Status SetDevice(const C_Device device) { return C_SUCCESS; } + +C_Status GetDevice(const C_Device device) { + device->id = 0; + return C_SUCCESS; +} + +C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; } + +C_Status Finalize() { return C_SUCCESS; } + +C_Status GetDevicesCount(size_t *count) { + *count = 1; + return C_SUCCESS; +} + +C_Status GetDevicesList(size_t *device) { + *device = 0; + return C_SUCCESS; +} + +C_Status MemCpy(const C_Device device, void *dst, const void *src, + size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst, + const void *src, size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status Allocate(const C_Device device, void **ptr, size_t size) { + if (global_free_memory >= size) { + *ptr = malloc(size); + global_free_memory -= size; + return C_SUCCESS; + } else { + *ptr = nullptr; + return C_FAILED; + } +} + +C_Status Deallocate(const C_Device device, void *ptr, size_t size) { + free(ptr); + global_free_memory += size; + return C_SUCCESS; +} + +C_Status CreateStream(const C_Device device, C_Stream *stream) { + return C_SUCCESS; +} + +C_Status DestroyStream(const C_Device device, C_Stream stream) { + return C_SUCCESS; +} + +C_Status CreateEvent(const C_Device device, C_Event *event) { + return C_SUCCESS; +} + +C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) { + return C_SUCCESS; +} + +C_Status DestroyEvent(const C_Device device, C_Event event) { + return C_SUCCESS; +} + +C_Status SyncDevice(const C_Device device) { return C_SUCCESS; } + +C_Status SyncStream(const C_Device device, C_Stream stream) { + return C_SUCCESS; +} + +C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; } + +C_Status StreamWaitEvent(const C_Device device, C_Stream stream, + C_Event event) { + return C_SUCCESS; +} + +C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; } + +C_Status DeviceMemStats(const C_Device device, size_t *total_memory, + size_t *free_memory) { + *total_memory = global_total_memory; + *free_memory = global_free_memory; + return C_SUCCESS; +} + +C_Status DeviceMinChunkSize(const C_Device device, size_t *size) { + *size = 4 * 1024; + return C_SUCCESS; +} + +C_Status DeviceMaxChunkSize(const C_Device device, size_t *size) { + *size = 64 * 1024; + return C_SUCCESS; +} + +C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) { + *size = global_total_memory * 0.95; + return C_SUCCESS; +} + +#define DEVICE_TYPE "FakeCPU" +#define SUB_DEVICE_TYPE "V100" + +void InitFakeCPUDevice(CustomRuntimeParams *params) { + params->device_type = const_cast(DEVICE_TYPE); + params->sub_device_type = const_cast(SUB_DEVICE_TYPE); + params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION; + params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION; + params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION; + + memset(reinterpret_cast(params->interface), 0, + sizeof(C_DeviceInterface)); + + params->interface->initialize = Init; + params->interface->finalize = Finalize; + + params->interface->init_device = InitDevice; + params->interface->set_device = SetDevice; + params->interface->get_device = GetDevice; + 
params->interface->deinit_device = DestroyDevice; + + params->interface->create_stream = CreateStream; + params->interface->destroy_stream = DestroyStream; + + params->interface->create_event = CreateEvent; + params->interface->destroy_event = DestroyEvent; + params->interface->record_event = RecordEvent; + + params->interface->synchronize_device = SyncDevice; + params->interface->synchronize_stream = SyncStream; + params->interface->synchronize_event = SyncEvent; + params->interface->stream_wait_event = StreamWaitEvent; + + params->interface->memory_copy_h2d = MemCpy; + params->interface->memory_copy_d2d = MemCpy; + params->interface->memory_copy_d2h = MemCpy; + params->interface->async_memory_copy_h2d = AsyncMemCpy; + params->interface->async_memory_copy_d2d = AsyncMemCpy; + params->interface->async_memory_copy_d2h = AsyncMemCpy; + params->interface->device_memory_allocate = Allocate; + params->interface->host_memory_allocate = Allocate; + params->interface->unified_memory_allocate = Allocate; + params->interface->device_memory_deallocate = Deallocate; + params->interface->host_memory_deallocate = Deallocate; + params->interface->unified_memory_deallocate = Deallocate; + + params->interface->get_device_count = GetDevicesCount; + params->interface->get_device_list = GetDevicesList; + params->interface->device_memory_stats = DeviceMemStats; + + params->interface->device_max_chunk_size = DeviceMaxChunkSize; + params->interface->device_min_chunk_size = DeviceMinChunkSize; + params->interface->device_max_alloc_size = DeviceMaxAllocSize; +} diff --git a/paddle/fluid/platform/device/device_base.cc b/paddle/fluid/platform/device/device_base.cc new file mode 100644 index 0000000000000000000000000000000000000000..6234c9612687e507acd2642ef1d39cc0f8da4539 --- /dev/null +++ b/paddle/fluid/platform/device/device_base.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
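For comparison with the fake CPU device above, here is a rough sketch of what an out-of-tree plugin's exported entry point could look like. "MyDevice", "gen1", and the commented-out callbacks are placeholders; a real plugin would wire up the full callback set exactly as InitFakeCPUDevice does.

```cpp
#include <cstring>

#include "paddle/fluid/platform/device/device_ext.h"

// Illustrative only: the runtime dlopen()s the plugin and calls InitPlugin,
// which fills in the version, the device type names, and the callback table.
extern "C" void InitPlugin(CustomRuntimeParams* params) {
  params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
  params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
  params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;

  params->device_type = const_cast<char*>("MyDevice");  // placeholder name
  params->sub_device_type = const_cast<char*>("gen1");  // placeholder name

  std::memset(params->interface, 0, sizeof(C_DeviceInterface));
  // A real plugin fills every callback it supports, for example:
  // params->interface->initialize             = Init;
  // params->interface->device_memory_allocate = Allocate;
  // params->interface->memory_copy_h2d        = MemCpy;
  // ... see InitFakeCPUDevice above for the complete set.
}
```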
+ +#include "paddle/fluid/platform/device/device_base.h" +#include "gflags/gflags.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +namespace paddle { +namespace platform { + +#define INTERFACE_UNIMPLEMENT \ + PADDLE_THROW(platform::errors::Unimplemented( \ + "%s is not implemented on %s device.", __func__, Type())); + +// info +size_t DeviceInterface::GetComputeCapability() { + VLOG(10) << Type() + " get compute capability " << 0; + return 0; +} + +size_t DeviceInterface::GetRuntimeVersion() { + VLOG(10) << Type() + " get runtime version " << 0; + return 0; +} + +size_t DeviceInterface::GetDriverVersion() { + VLOG(10) << Type() + " get driver version " << 0; + return 0; +} + +// device manage +void DeviceInterface::Initialize() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::Finalize() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::SynchronizeDevice(size_t dev_id) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::InitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::DeInitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +int DeviceInterface::GetDevice() { INTERFACE_UNIMPLEMENT; } + +// stream manage +void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority, + const stream::Stream::Flag& flag) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::DestroyStream(size_t dev_id, stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::SynchronizeStream(size_t dev_id, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; + return true; +} + +void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::StreamWaitEvent(size_t dev_id, + const stream::Stream* stream, + const event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +// event manage +void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::SynchronizeEvent(size_t dev_id, + const event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) { + INTERFACE_UNIMPLEMENT; + return true; +} + +// memery manage +void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst, + size_t src_id, const void* src, size_t size, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void* 
DeviceInterface::MemoryAllocate(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocate(size_t dev_id, void* ptr, size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryStats(size_t dev_id, size_t* total, size_t* free) { + INTERFACE_UNIMPLEMENT; +} + +size_t DeviceInterface::GetMinChunkSize(size_t dev_id) { + INTERFACE_UNIMPLEMENT; +} + +size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) { + size_t available_to_alloc = AvailableAllocSize(dev_id); + PADDLE_ENFORCE_GT(available_to_alloc, 0, + platform::errors::ResourceExhausted( + "Not enough available %s memory.", Type())); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted( + "Not enough available %s memory.", Type())); + return alloc_bytes; +} + +size_t DeviceInterface::AvailableAllocSize(size_t dev_id) { + size_t total = 0; + size_t available = 0; + MemoryStats(dev_id, &total, &available); + size_t reserving = + static_cast(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = GetMinChunkSize(dev_id); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + return available_to_alloc; +} + +size_t DeviceInterface::GetInitAllocSize(size_t dev_id) { + size_t init_alloc_size = AllocSize(dev_id, false); + VLOG(10) << Type() + " init alloc size " << (init_alloc_size >> 20) << "M"; + return init_alloc_size; +} + +size_t DeviceInterface::GetReallocSize(size_t dev_id) { + size_t realloc_size = AllocSize(dev_id, true); + VLOG(10) << Type() + " realloc size " << (realloc_size >> 20) << "M"; + return realloc_size; +} + +size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) { + size_t max_alloc_size = + std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id)); + VLOG(10) << Type() + " max alloc size " << (max_alloc_size >> 20) << "M"; + return max_alloc_size; +} + +size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) { + size_t max_chunk_size = GetMaxAllocSize(dev_id); + VLOG(10) << Type() + " max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) { + VLOG(10) << Type() + " extra padding size " << 0; + return 0; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_base.h b/paddle/fluid/platform/device/device_base.h new file mode 100644 index 
0000000000000000000000000000000000000000..d70b02be80eacd9d492b8a8d40c0a074dfe9c6e3 --- /dev/null +++ b/paddle/fluid/platform/device/device_base.h @@ -0,0 +1,166 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" + +namespace paddle { +namespace platform { + +class DeviceInterface { // Driver / Runtime + public: + DeviceInterface(const std::string& type, uint8_t priority, bool is_custom) + : type_(type), priority_(priority), is_custom_(is_custom) {} + uint8_t Priority() { return priority_; } + std::string Type() { return type_; } + bool IsCustom() { return is_custom_; } + + virtual ~DeviceInterface() {} + + // Info + virtual size_t GetComputeCapability(); + + virtual size_t GetRuntimeVersion(); + + virtual size_t GetDriverVersion(); + + // Platform + //! Initialize + virtual void Initialize(); + + //! Finalize + virtual void Finalize(); + + // Device + virtual size_t GetDeviceCount() = 0; + virtual std::vector GetDeviceList() = 0; + + //! Wait for compute device to finish. + virtual void SynchronizeDevice(size_t dev_id); + + //! Initialize device. + virtual void InitDevice(size_t dev_id); + + //! Deinitialize device. + virtual void DeInitDevice(size_t dev_id); + + // ! Set device to be used. + virtual void SetDevice(size_t dev_id); + + // ! Returns which device is currently being used. + virtual int GetDevice(); + + // Stream + // ! Create an asynchronous stream + virtual void CreateStream( + size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag); + + // ! Destroys an asynchronous stream. + virtual void DestroyStream(size_t dev_id, stream::Stream* stream); + + // ! Waits for stream tasks to complete. + virtual void SynchronizeStream(size_t dev_id, const stream::Stream* stream); + + // ! Queries an asynchronous stream for completion status. + virtual bool QueryStream(size_t dev_id, const stream::Stream* stream); + + // ! Add a callback to a compute stream. + virtual void AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback); + + // Event + // ! Create an event. + virtual void CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags); + + // ! Destroy an event. + virtual void DestroyEvent(size_t dev_id, event::Event* event); + + // ! Records an event. + virtual void RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream); + + // ! Waits for event to complete. + virtual void SynchronizeEvent(size_t dev_id, const event::Event* event); + // ! Queries an event for completion status. + virtual bool QueryEvent(size_t dev_id, const event::Event* event); + + // ! 
Make a compute stream wait on an event + virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + const event::Event* event); + + // Memory + virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id, + const void* src, size_t size, + const stream::Stream* stream = nullptr); + + virtual void* MemoryAllocate(size_t dev_id, size_t size); + + virtual void MemoryDeallocate(size_t dev_id, void* ptr, size_t size); + + virtual void* MemoryAllocateHost(size_t dev_id, size_t size); + + virtual void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size); + + virtual void* MemoryAllocateUnified(size_t dev_id, size_t size); + + virtual void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size); + + virtual void MemorySet(size_t dev_id, void* ptr, uint8_t value, size_t size); + + virtual void MemoryStats(size_t dev_id, size_t* total, size_t* free); + + virtual size_t GetMinChunkSize(size_t dev_id); + + virtual size_t GetInitAllocSize(size_t dev_id); + + virtual size_t GetReallocSize(size_t dev_id); + + virtual size_t GetMaxAllocSize(size_t dev_id); + + virtual size_t GetMaxChunkSize(size_t dev_id); + + virtual size_t GetExtraPaddingSize(size_t dev_id); + + private: + const std::string type_; + const uint8_t priority_; + const bool is_custom_; + + size_t AllocSize(size_t dev_id, bool realloc); + + size_t AvailableAllocSize(size_t dev_id); +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/device_ext.h b/paddle/fluid/platform/device/device_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..d1e1340f74b7741f867b85d7ab0b1e42c9621a47 --- /dev/null +++ b/paddle/fluid/platform/device/device_ext.h @@ -0,0 +1,497 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
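To make the extension point concrete, below is a skeletal, hypothetical backend built on the DeviceInterface declared above: only the two pure-virtual queries are overridden, everything else inherits the Unimplemented-throwing defaults from device_base.cc, and a file-scope Registrar hands the instance to DeviceManager. The class name and priority value are illustrative, and the vector element type is inferred from how Register() iterates the device count.

```cpp
#include <vector>

#include "paddle/fluid/platform/device/device_base.h"
#include "paddle/fluid/platform/device/device_manager.h"

namespace paddle {
namespace platform {

// Hypothetical single-device backend; real backends also override the
// stream/event/memory hooks they support.
class DemoDevice : public DeviceInterface {
 public:
  DemoDevice()
      : DeviceInterface("DemoDevice", /*priority=*/255, /*is_custom=*/true) {}

  size_t GetDeviceCount() override { return 1; }
  std::vector<size_t> GetDeviceList() override { return {0}; }
};

// Registration at static-initialization time; afterwards
// DeviceManager::HasDeviceType("DemoDevice") should hold.
static Registrar demo_device_registrar(new DemoDevice());

}  // namespace platform
}  // namespace paddle
```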
+ +#pragma once +#if !defined(_WIN32) && !defined(__APPLE__) +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION 0 +#define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1 +#define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1 + +typedef enum { + C_SUCCESS = 0, // success + C_WARNING, // results may not meet expectation (such as an asynchronous + // interface is actually synchronous) + C_FAILED, // resource exhausted/query failed + C_ERROR, // invalid argument/wrong usage/uninitialized + C_INTERNAL_ERROR // plugin error +} C_Status; + +typedef struct C_Device_st { int id; } * C_Device; + +typedef struct C_Stream_st* C_Stream; + +typedef struct C_Event_st* C_Event; + +typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data, + C_Status* status); + +struct C_DeviceInterface { + // Core fill it and plugin must to check it + size_t size; + + /////////////////////// + // device manage api // + /////////////////////// + + /** + * @brief Initialize hardware + * + */ + C_Status (*initialize)(); + + /** + * @brief Deinitialize hardware + * + */ + C_Status (*finalize)(); + + /** + * @brief Initialize device + * + * @param[C_Device] device Core fill it with a logical id, and then plugin + * must replace it with a physical id + */ + C_Status (*init_device)(const C_Device device); + + /** + * @brief Set current device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*set_device)(const C_Device device); + + /** + * @brief Get current device + * + * @param[C_Device] device Plugin fill it with a physical id + */ + C_Status (*get_device)(const C_Device device); + + /** + * @brief Deinitialize device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*deinit_device)(const C_Device device); + + /** + * @brief Create a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream*] stream Plugin create a stream and fill it + */ + C_Status (*create_stream)(const C_Device device, C_Stream* stream); + + /** + * @brief Destroy a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*destroy_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Query a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*query_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Add a callback to stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Callback] callback + * @param[void*] user_data + */ + C_Status (*stream_add_callback)(const C_Device device, C_Stream stream, + C_Callback callback, void* user_data); + + /** + * @brief Create an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event*] event Plugin create an event and fill it + */ + C_Status (*create_event)(const C_Device device, C_Event* event); + + /** + * @brief Record an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Event] event + */ + C_Status (*record_event)(const C_Device device, C_Stream stream, + C_Event event); + + /** + * @brief Destroy an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event] event + */ + C_Status (*destroy_event)(const C_Device device, C_Event event); + + /** + * @brief Query an event + * + * @param[C_Device] device Core fill it with a physical id + 
* @param[C_Event] event + */ + C_Status (*query_event)(const C_Device device, C_Event event); + + /** + * @brief Synchronize a device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*synchronize_device)(const C_Device device); + + /** + * @brief Synchronize a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*synchronize_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Synchronize an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event] event + */ + C_Status (*synchronize_event)(const C_Device device, C_Event event); + + /** + * @brief Make a stream wait on an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Event] event + */ + C_Status (*stream_wait_event)(const C_Device device, C_Stream stream, + C_Event event); + + void* reserved_dev_api[8]; + + /////////////////////// + // memory manage api // + /////////////////////// + + /** + * @brief Device memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*device_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Device memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*device_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Device memory set + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[unsigned char] value + * @param[size_t] size + */ + C_Status (*device_memory_set)(const C_Device device, void* ptr, + unsigned char value, size_t size); + + /** + * @brief Host memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*host_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Host memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*host_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Unified memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*unified_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Unified memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Memory copy from host to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Memory copy from device to host + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Memory copy from device to 
device + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Peer memory copy from device to device + * + * @param[C_Device] dst_device Core fill it with a physical id + * @param[C_Device] src_device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_p2p)(const C_Device dst_device, + const C_Device src_device, void* dst, + const void* src, size_t size); + + /** + * @brief Asynchonrize memory copy from host to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Asynchonrize memory copy from device to host + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Asynchonrize memory copy from device to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Peer asynchonrize memory copy from host to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_p2p)(const C_Device dst_device, + const C_Device src_device, C_Stream stream, + void* dst, const void* src, size_t size); + + void* reserved_mem_api[8]; + + ////////////// + // info api // + ////////////// + + /** + * @brief Get visible device count + * + * @param[size_t*] count Plugin fill it + */ + C_Status (*get_device_count)(size_t* count); + + /** + * @brief Get visible device list + * + * @param[size_t*] devices Plugin fill it + */ + C_Status (*get_device_list)(size_t* devices); + + /** + * @brief Device memory statistic + * + * @param[C_Device] device Core fill it with a physical id + * @param[size_t*] total_memory + * @param[size_t*] free_memory + * @param[size_t*] used_memory + */ + C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory, + size_t* free_memory); + + /** + * @brief Device minimum chunk size + * + * @param[size_t*] count + */ + C_Status (*device_min_chunk_size)(const C_Device device, size_t* count); + + /** + * @brief Device maximum chunk size + * + * @param[size_t*] count + */ + C_Status (*device_max_chunk_size)(const C_Device device, size_t* count); + + /** + * @brief Device maximum alloc size + * + * @param[size_t*] count + */ + C_Status (*device_max_alloc_size)(const C_Device device, size_t* count); + + /** + * @brief Device extra padding size + * + * @param[size_t*] size + */ + C_Status (*device_extra_padding_size)(const C_Device device, size_t* size); + + /** + * @brief Device initial allocated size + * + * @param[size_t*] size + */ + C_Status (*device_init_alloc_size)(const C_Device device, size_t* size); + + /** + * @brief 
Device reallocated size + * + * @param[size_t*] size + */ + C_Status (*device_realloc_size)(const C_Device device, size_t* size); + + /** + * @brief Get compute capability + * + * @param[size_t*] compute_capability + */ + C_Status (*get_compute_capability)(size_t* compute_capability); + + /** + * @brief Get runtime version + * + * @param[size_t*] version + */ + C_Status (*get_runtime_version)(size_t* version); + + /** + * @brief Get driver version + * + * @param[size_t*] version + */ + C_Status (*get_driver_version)(size_t* version); + + void* reserved_info_api[8]; + + /////////////// + // other api // + /////////////// + + void* reserved_other_api[8]; +}; + +struct CustomRuntimeVersion { + size_t major, minor, patch; +}; + +struct CustomRuntimeParams { + // Core fill it and plugin must to check it + size_t size; + // Plugin fill it + C_DeviceInterface* interface; + // Plugin fill it and Core will to check it + CustomRuntimeVersion version; + // Plugin fill it + char* device_type; + // Plugin fill it + char* sub_device_type; + + char reserved[32]; +}; + +// Plugin implement it and fill CustomRuntimeParams +void InitPlugin(CustomRuntimeParams*); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/fluid/platform/device/device_guard.cc b/paddle/fluid/platform/device/device_guard.cc new file mode 100644 index 0000000000000000000000000000000000000000..55d8b9dc6a9a58dda5ae8192709e6858da878da7 --- /dev/null +++ b/paddle/fluid/platform/device/device_guard.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/device_guard.h" + +namespace paddle { +namespace platform { +// Even this source file does not contains any code, it is better to keep this +// source file for cmake dependency. +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_guard.h b/paddle/fluid/platform/device/device_guard.h new file mode 100644 index 0000000000000000000000000000000000000000..638e9c984b4d25e474fd5949e9fdc5df98a344ef --- /dev/null +++ b/paddle/fluid/platform/device/device_guard.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
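The CustomRuntimeParams comments above state that the plugin fills the version and the core checks it; a plausible, purely hypothetical version of that check, written against the version macros in device_ext.h, is sketched below. The exact policy used by the real loader may differ.

```cpp
#include "paddle/fluid/platform/device/device_ext.h"

// Hypothetical helper: accept a plugin only if it was built against a
// compatible device_ext.h (same major version, not newer minor version).
bool IsCompatibleCustomRuntime(const CustomRuntimeParams& params) {
  return params.version.major == PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION &&
         params.version.minor <= PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
}
```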
+ +#pragma once +#include "paddle/fluid/platform/device/device_manager.h" + +namespace paddle { +namespace platform { + +class DeviceGuard { + public: + explicit inline DeviceGuard(const Place& place) + : dev_type_(PlaceHelper::GetDeviceType(place)) { + prev_id = DeviceManager::GetDevice(dev_type_); + cur_id = PlaceHelper::GetDeviceId(place); + + if (cur_id != prev_id) { + DeviceManager::SetDevice(dev_type_, cur_id); + } + } + + inline ~DeviceGuard() { + if (cur_id != prev_id) { + DeviceManager::SetDevice(dev_type_, prev_id); + } + } + + DeviceGuard(const DeviceGuard& o) = delete; + DeviceGuard& operator=(const DeviceGuard& o) = delete; + + private: + size_t prev_id, cur_id; + std::string dev_type_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_manager.cc b/paddle/fluid/platform/device/device_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..38dcb721b1faeac8bc14b49cf7f0957406d4c590 --- /dev/null +++ b/paddle/fluid/platform/device/device_manager.cc @@ -0,0 +1,420 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/device_manager.h" + +#if !defined(_WIN32) +#include +#else + +#endif + +#include +#include + +namespace paddle { +namespace platform { + +void Device::CreateStream(stream::Stream* stream, + const stream::Stream::Priority& priority, + const stream::Stream::Flag& flag) { + impl_->CreateStream(dev_id_, stream, priority, flag); +} + +void Device::DestroyStream(stream::Stream* stream) { + impl_->DestroyStream(dev_id_, stream); +} + +void Device::SynchronizeStream(const stream::Stream* stream) { + impl_->SynchronizeStream(dev_id_, stream); +} + +bool Device::QueryStream(const stream::Stream* stream) { + return impl_->QueryStream(dev_id_, stream); +} + +void Device::AddCallback(stream::Stream* stream, + stream::Stream::Callback* callback) { + impl_->AddCallback(dev_id_, stream, callback); +} + +void Device::CreateEvent(event::Event* event, event::Event::Flag flags) { + impl_->CreateEvent(dev_id_, event, flags); +} + +void Device::DestroyEvent(event::Event* event) { + impl_->DestroyEvent(dev_id_, event); +} + +void Device::RecordEvent(const event::Event* event, + const stream::Stream* stream) { + impl_->RecordEvent(dev_id_, event, stream); +} + +void Device::SynchronizeEvent(const event::Event* event) { + impl_->SynchronizeEvent(dev_id_, event); +} + +bool Device::QueryEvent(const event::Event* event) { + return impl_->QueryEvent(dev_id_, event); +} + +void Device::StreamWaitEvent(const stream::Stream* stream, + const event::Event* event) { + impl_->StreamWaitEvent(dev_id_, stream, event); +} + +void Device::MemoryCopyH2D(void* dst, const void* src, size_t size, + const stream::Stream* stream) { + impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream); +} + +void Device::MemoryCopyD2H(void* dst, const void* src, size_t size, + const stream::Stream* stream) { 
+ impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream); +} + +void Device::MemoryCopyD2D(void* dst, const void* src, size_t size, + const stream::Stream* stream) { + impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream); +} + +void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream); +} + +void* Device::MemoryAllocate(size_t size) { + return impl_->MemoryAllocate(dev_id_, size); +} + +void Device::MemoryDeallocate(void* ptr, size_t size) { + impl_->MemoryDeallocate(dev_id_, ptr, size); +} + +void* Device::MemoryAllocateHost(size_t size) { + return impl_->MemoryAllocateHost(dev_id_, size); +} + +void Device::MemoryDeallocateHost(void* ptr, size_t size) { + impl_->MemoryDeallocateHost(dev_id_, ptr, size); +} + +void* Device::MemoryAllocateUnified(size_t size) { + return impl_->MemoryAllocateUnified(dev_id_, size); +} + +void Device::MemoryDeallocateUnified(void* ptr, size_t size) { + impl_->MemoryDeallocateUnified(dev_id_, ptr, size); +} + +void Device::MemorySet(void* ptr, uint8_t value, size_t size) { + impl_->MemorySet(dev_id_, ptr, value, size); +} + +std::string Device::Type() { return impl_->Type(); } + +static pten::RWLock _global_device_manager_rw_lock; + +bool DeviceManager::Register(std::unique_ptr device_impl) { + pten::AutoWRLock lock(&_global_device_manager_rw_lock); + VLOG(4) << "Register Device - " << device_impl->Type(); + auto device_type = device_impl->Type(); + auto& dev_impl_map = Instance().device_impl_map_; + auto& dev_map = Instance().device_map_; + + if (dev_impl_map.find(device_type) == dev_impl_map.end()) { + dev_impl_map.insert( + std::pair>( + device_type, std::move(device_impl))); + auto& dev_impl = dev_impl_map[device_type]; + auto& dev_vec = dev_map[device_type]; + VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount(); + for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) { + dev_vec.emplace_back(new Device(i, dev_impl.get())); + } + } else { + auto& plat = dev_impl_map[device_type]; + if (plat->IsCustom() && plat->Priority() > device_impl->Priority()) { + dev_impl_map[device_type] = std::move(device_impl); + auto& dev_impl = dev_impl_map[device_type]; + auto& dev_vec = dev_map[device_type]; + dev_vec.clear(); + VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount(); + for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) { + dev_vec.emplace_back(new Device(i, dev_impl.get())); + } + } else { + return false; + } + } + return true; +} + +DeviceInterface* DeviceManager::GetDeviceInterfaceWithType( + const std::string& device_type) { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + + auto& dev_impl_map = Instance().device_impl_map_; + if (dev_impl_map.find(device_type) != dev_impl_map.end()) { + return dev_impl_map.at(device_type).get(); + } else { + LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n"; + PADDLE_THROW( + platform::errors::Fatal("Unregistered device type %s.", device_type)); + return nullptr; + } +} + +Device* DeviceManager::GetDeviceWithPlace(const Place& place) { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + + auto& dev_map = Instance().device_map_; + auto dev_type = PlaceHelper::GetDeviceType(place); + auto dev_id = PlaceHelper::GetDeviceId(place); + PADDLE_ENFORCE_NE(dev_map.find(dev_type), dev_map.end(), + platform::errors::NotFound( + "Unable to find Device with type %s.", dev_type)); + auto& dev_vec = dev_map[dev_type]; + 
PADDLE_ENFORCE_LT( + dev_id, dev_vec.size(), + platform::errors::OutOfRange( + "The visible devices count of type %s is %d, but dev_id is %d.", + dev_type, dev_vec.size(), dev_id)); + return dev_vec[dev_id].get(); +} + +std::vector DeviceManager::GetAllDeviceTypes() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + devices.push_back(iter->first); + } + return devices; +} + +std::vector DeviceManager::GetAllCustomDeviceTypes() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + if (iter->second->IsCustom()) { + devices.push_back(iter->first); + } + } + return devices; +} + +std::vector DeviceManager::GetAllDeviceList() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + size_t device_count = iter->second->GetDeviceCount(); + std::string dev_type = iter->second->Type(); + if (device_count == 1) { + devices.push_back(dev_type); + } else { + for (size_t i = 0; i < device_count; ++i) { + devices.push_back(dev_type + ":" + std::to_string(i)); + } + } + } + return devices; +} + +std::vector DeviceManager::GetAllCustomDeviceList() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + size_t device_count = iter->second->GetDeviceCount(); + std::string dev_type = iter->second->Type(); + if (iter->second->IsCustom()) { + if (device_count == 1) { + devices.push_back(dev_type); + } else { + for (size_t i = 0; i < device_count; ++i) { + devices.push_back(dev_type + ":" + std::to_string(i)); + } + } + } + } + return devices; +} + +bool DeviceManager::HasDeviceType(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl != nullptr; +} + +bool DeviceManager::IsCustom(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->IsCustom(); +} + +void DeviceManager::Initialize(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->Initialize(); +} + +void DeviceManager::Finalize(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->Finalize(); +} + +void DeviceManager::SynchronizeDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->SynchronizeDevice(device_id); +} + +void DeviceManager::InitDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->InitDevice(device_id); +} + +void DeviceManager::DeInitDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->DeInitDevice(device_id); +} + +void 
DeviceManager::SetDevice(const std::string& device_type, + size_t device_id) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->SetDevice(device_id); +} + +void DeviceManager::SetDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + DeviceManager::SetDevice(device_type, device_id); +} + +int DeviceManager::GetDevice(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDevice(); +} + +size_t DeviceManager::GetMinChunkSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMinChunkSize(device_id); +} + +size_t DeviceManager::GetMaxChunkSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMaxChunkSize(device_id); +} + +size_t DeviceManager::GetMaxAllocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMaxAllocSize(device_id); +} + +size_t DeviceManager::GetInitAllocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetInitAllocSize(device_id); +} + +size_t DeviceManager::GetReallocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetReallocSize(device_id); +} + +size_t DeviceManager::GetExtraPaddingSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetExtraPaddingSize(device_id); +} + +void DeviceManager::MemoryStats(const Place& place, size_t* total, + size_t* free) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->MemoryStats(device_id, total, free); +} + +size_t DeviceManager::GetDeviceCount(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDeviceCount(); +} + +std::vector DeviceManager::GetDeviceList( + const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDeviceList(); +} + +DeviceManager& DeviceManager::Instance() { + static DeviceManager platform_manager; + return platform_manager; +} + +std::vector ListAllLibraries(const std::string& library_dir) { + std::vector libraries; + std::regex express(".*\\.so"); + std::match_results results; + DIR* dir = nullptr; + dirent* ptr = nullptr; + + dir = opendir(library_dir.c_str()); + if (dir == nullptr) { + VLOG(4) << "open CustomDevice library_dir: " << library_dir << " failed"; + } else { + while ((ptr = readdir(dir)) != nullptr) { + std::string filename(ptr->d_name); + if (std::regex_match(filename.begin(), filename.end(), results, + express)) { + 
libraries.push_back(library_dir + '/' + filename); + VLOG(4) << "found CustomDevice library: " << libraries.back() + << std::endl; + } + } + closedir(dir); + } + + return libraries; +} + +bool LoadCustomDevice(const std::string& library_dir) { + std::vector libs = ListAllLibraries(library_dir); + for (const auto& lib_path : libs) { + auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW); + LoadCustomRuntimeLib(dso_handle); + } + return true; +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/device_manager.h b/paddle/fluid/platform/device/device_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..ad910605d987aed726c41ff242434979aa2bb058 --- /dev/null +++ b/paddle/fluid/platform/device/device_manager.h @@ -0,0 +1,186 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE + +#include "paddle/fluid/platform/device/device_base.h" +#include "paddle/fluid/platform/device/device_ext.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/pten/backends/dynload/port.h" +#include "paddle/pten/core/utils/rw_lock.h" + +namespace paddle { +namespace platform { +class Device final { + public: + Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {} + + // Stream + // ! Create an asynchronous stream + void CreateStream( + stream::Stream* stream, const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag); + + // ! Destroys an asynchronous stream. + void DestroyStream(stream::Stream* stream); + + // ! Waits for stream tasks to complete. + void SynchronizeStream(const stream::Stream* stream); + + // ! Queries an asynchronous stream for completion status. + bool QueryStream(const stream::Stream* stream); + + // ! Add a callback to a compute stream. + void AddCallback(stream::Stream* stream, stream::Stream::Callback* callback); + + // Event + // ! Create an event. + void CreateEvent(event::Event* event, event::Event::Flag flags); + + // ! Destroy an event. + void DestroyEvent(event::Event* event); + + // ! Records an event. + void RecordEvent(const event::Event* event, const stream::Stream* stream); + + // ! Waits for event to complete. + void SynchronizeEvent(const event::Event* event); + + // ! Queries an event for completion status. + bool QueryEvent(const event::Event* event); + + // ! 
Make a compute stream wait on an event + void StreamWaitEvent(const stream::Stream* stream, const event::Event* event); + + // Memory + void MemoryCopyH2D(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyD2H(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyD2D(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src, + size_t size, const stream::Stream* stream = nullptr); + + void* MemoryAllocate(size_t size); + + void MemoryDeallocate(void* ptr, size_t size); + + void* MemoryAllocateHost(size_t size); + + void MemoryDeallocateHost(void* ptr, size_t size); + + void* MemoryAllocateUnified(size_t size); + + void MemoryDeallocateUnified(void* ptr, size_t size); + + void MemorySet(void* ptr, uint8_t value, size_t size); + + std::string Type(); + + private: + size_t dev_id_; + DeviceInterface* impl_; +}; + +class DeviceManager { + public: + static bool Register(std::unique_ptr device); + static bool RegisterPinnedDevice(DeviceInterface* device); + static Device* GetDeviceWithPlace(const Place& place); + static std::vector GetAllDeviceTypes(); + static std::vector GetAllCustomDeviceTypes(); + static std::vector GetAllDeviceList(); + static std::vector GetAllCustomDeviceList(); + static bool HasDeviceType(const std::string& device_type); + static bool IsCustom(const std::string& device_type); + + // platform & device + static void Initialize(const std::string& device_type); + + static void Finalize(const std::string& device_type); + + static void SynchronizeDevice(const Place& place); + + static void InitDevice(const Place& place); + + static void DeInitDevice(const Place& place); + + static void SetDevice(const std::string& device_type, size_t device_id); + + static void SetDevice(const Place& place); + + static int GetDevice(const std::string& device_type); + + static size_t GetMinChunkSize(const Place& place); + + static size_t GetMaxChunkSize(const Place& place); + + static size_t GetMaxAllocSize(const Place& place); + + static size_t GetInitAllocSize(const Place& place); + + static size_t GetReallocSize(const Place& place); + + static size_t GetExtraPaddingSize(const Place& place); + + static void MemoryStats(const Place& place, size_t* total, size_t* free); + + static size_t GetDeviceCount(const std::string& device_type); + + static std::vector GetDeviceList(const std::string& device_type); + + private: + DISABLE_COPY_AND_ASSIGN(DeviceManager); + DeviceManager() {} + static DeviceManager& Instance(); + static DeviceInterface* GetDeviceInterfaceWithType( + const std::string& device_type); + + std::unordered_map> + device_impl_map_; + std::unordered_map>> + device_map_; +}; + +bool LoadCustomRuntimeLib(void* dso_handle); + +bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, + std::unique_ptr device_interface, + void* dso_handle); + +bool LoadCustomDevice(const std::string& library_path); + +class Registrar { + public: + template + explicit Registrar(DeviceT* device_ptr) { + DeviceManager::Register(std::unique_ptr(device_ptr)); + } + + void Touch() {} +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index 4f8bbb2d2689eb6ffee1119c6eb14ef27de7a2c8..ba3461d8c14871561b2d069f9350698306e22366 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ 
b/paddle/fluid/platform/device/device_wrapper.h @@ -38,3 +38,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_IPU #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/device/custom/enforce_custom.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#endif diff --git a/paddle/fluid/platform/device/event.cc b/paddle/fluid/platform/device/event.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e6316ea16de020801a7afce6ad47f4b06eca022 --- /dev/null +++ b/paddle/fluid/platform/device/event.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/stream.h" + +namespace paddle { +namespace platform { +namespace event { + +event_t Event::raw_event() const { return event_; } + +void Event::set_event(event_t event) { event_ = event; } + +Event::Event(const Place& place, event_t event) + : place_(place), + device_(platform::DeviceManager::GetDeviceWithPlace(place)), + event_(event), + own_data_(false) {} + +Event::~Event() { Destroy(); } + +bool Event::Init(const Place& place, Flag flags) { + place_ = place; + DeviceGuard guard(place_); + device_->CreateEvent(this, flags); + VLOG(3) << "Init Event: " << event_ << ", place: " << place_ + << ", flag:" << static_cast(flags); + own_data_ = true; + return true; +} + +void Event::Destroy() { + if (own_data_) { + DeviceGuard guard(place_); + device_->DestroyEvent(this); + own_data_ = false; + } +} + +void Event::Record(const stream::Stream* stream) { stream->RecordEvent(this); } + +bool Event::Query() const { return device_->QueryEvent(this); } + +void Event::Synchonrize() const { device_->SynchronizeEvent(this); } + +const Place& Event::GetPlace() const { return place_; } + +} // namespace event +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/event.h b/paddle/fluid/platform/device/event.h new file mode 100644 index 0000000000000000000000000000000000000000..376d73eb66660fdcdc0b2412d5d5e1371145e634 --- /dev/null +++ b/paddle/fluid/platform/device/event.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +class Device; + +namespace stream { +class Stream; +} // namespace stream + +namespace event { +using event_t = void*; + +class Event { + public: + enum Flag { + Default = 0x0, + BlockingSync = 0x1, + DisableTiming = 0x2, + Interprocess = 0x4, + }; + + // For compatible + Event(const Place& place, event_t event); + ~Event(); + event_t raw_event() const; + void set_event(event_t event); + bool Init(const Place& place, Flag flags = Flag::Default); + void Destroy(); + void Record(const stream::Stream* stream); + bool Query() const; + void Synchonrize() const; + const Place& GetPlace() const; + + private: + DISABLE_COPY_AND_ASSIGN(Event); + Place place_; + Device* device_; + event_t event_; + bool own_data_ = true; +}; +} // namespace event + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index cd78a89088cc612c3fb43e489cfb7ef2e07cfcf3..58a25ae8d0e565b649b29863637fa9d000d524d3 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -16,8 +16,10 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/core/enforce.h" namespace paddle { namespace platform { @@ -61,6 +63,19 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { +#if defined(PADDLE_CUDA_BF16) + return bfloat16(__shfl_down_sync(mask, static_cast(val), + static_cast(delta), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 13ffc2396946c5819c9276cf474d96a8057c4094..63897bd6717408bff4bd4db5e739b3ba64316350 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -16,6 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. 
#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -59,6 +60,14 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { + return bfloat16(__shfl_down(static_cast(val), + static_cast(delta), width)); +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { diff --git a/paddle/fluid/platform/device/stream.cc b/paddle/fluid/platform/device/stream.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f867e5ee7737d45f26a1967a3112c7075843454 --- /dev/null +++ b/paddle/fluid/platform/device/stream.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/event.h" + +namespace paddle { +namespace platform { +namespace stream { + +Stream::~Stream() { Destroy(); } + +const stream_t& Stream::raw_stream() const { return stream_; } + +void Stream::set_stream(stream_t stream) { stream_ = stream; } + +// For compatiable +Stream::Stream(const Place& place, stream_t stream) + : place_(place), + device_(platform::DeviceManager::GetDeviceWithPlace(place)), + stream_(stream), + callback_manager_(new CallbackManager(this)), + own_data_(false) {} + +bool Stream::Init(const Place& place, const Priority& priority, + const Flag& flag) { + place_ = place; + device_ = platform::DeviceManager::GetDeviceWithPlace(place); + DeviceGuard guard(place_); + device_->CreateStream(this, priority, flag); + + callback_manager_.reset(new CallbackManager(this)); + VLOG(3) << "Init Stream: " << stream_ << ", place: " << place_ + << ", priority: " << static_cast(priority) + << ", flag:" << static_cast(flag); + own_data_ = true; + return true; +} + +void Stream::RecordEvent(event::Event* event, Callback callback) const { + callback(); + device_->RecordEvent(event, this); +} + +void Stream::RecordEvent(event::Event* event) const { + device_->RecordEvent(event, this); +} + +void Stream::WaitEvent(event::Event* event) const { + device_->StreamWaitEvent(this, event); +} + +void Stream::Wait() const { +#if !defined(_WIN32) + device_->SynchronizeStream(this); +#else + while (1) { + if (device_->QueryStream(this)) { + break; + } + } +#endif +} + +void Stream::WaitCallback() const { callback_manager_->Wait(); } + +void Stream::Destroy() { + if (own_data_) { + DeviceGuard guard(place_); + device_->DestroyStream(this); + own_data_ = false; + } +} + +bool Stream::Query() const { return device_->QueryStream(this); } + +void Stream::Synchronize() const { 
device_->SynchronizeStream(this); } + +const Place& Stream::GetPlace() const { return place_; } + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/stream.h b/paddle/fluid/platform/device/stream.h new file mode 100644 index 0000000000000000000000000000000000000000..25cf705ee0951847bfda84b336d3579403e8ab37 --- /dev/null +++ b/paddle/fluid/platform/device/stream.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +class Device; + +namespace event { +class Event; +} // namespace event + +namespace stream { +using stream_t = void*; +class Stream { + public: + enum class Priority : uint8_t { + kNull = 0x0, + kHigh = 0x1, + kNormal = 0x2, + }; + + enum class Flag : uint8_t { + kDefaultFlag = 0x0, + kStreamNonBlocking = 0x1, + }; + + using Callback = std::function; + + Stream() = default; + // For compatiable + Stream(const Place& place, stream_t stream); + ~Stream(); + const stream_t& raw_stream() const; + void set_stream(stream_t stream); + bool Init(const Place& place, const Priority& priority = Priority::kNormal, + const Flag& flag = Flag::kDefaultFlag); + template + void AddCallback(Callback&& callback) const { + callback_manager_->AddCallback(callback); + } + void RecordEvent(event::Event* event, Callback callback) const; + void RecordEvent(event::Event* event) const; + void WaitEvent(event::Event* event) const; + void Wait() const; + void WaitCallback() const; + void Destroy(); + bool Query() const; + void Synchronize() const; + const Place& GetPlace() const; + + private: + DISABLE_COPY_AND_ASSIGN(Stream); + Place place_; + Device* device_; + stream_t stream_; + std::unique_ptr callback_manager_; + bool own_data_ = true; +}; + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a0a853a2f059745b281d3651d39baf061edf1053..d448df0702aadd56157902b55b11c41496bcf484 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -256,6 +257,15 @@ DeviceContextPool::DeviceContextPool( "NPUPinnedPlace is not supported. Please re-compile with " "WITH_ASCEND_CL " "option.")); +#endif + } else if (platform::is_custom_place(p)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + EmplaceDeviceContext(&device_contexts_, p); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported. 
Please re-compile with " + "WITH_CUSTOM_DEVICE " + "option.")); #endif } } @@ -885,6 +895,24 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( return key_it->second; } +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +CustomDeviceContext::CustomDeviceContext(CustomPlace place) : place_(place) { + DeviceGuard guard(place_); + stream_.reset(new stream::Stream()); + stream_->Init(place_); +} + +CustomDeviceContext::~CustomDeviceContext() {} + +const Place& CustomDeviceContext::GetPlace() const { return place_; } + +void CustomDeviceContext::Wait() const { + // platform::RecordEvent record_event("NPUDeviceContext/wait"); + VLOG(4) << "CustomDevice context(" << this << ") Wait"; + stream_->Wait(); +} #endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 80dcf6d2ec23cea4f375f54d5d9f1b6e24f382cb..1d51383f6833b584f77bce9e865ad5d229590421 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -70,6 +70,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif + +#include "paddle/fluid/platform/device/device_ext.h" +#include "paddle/fluid/platform/device/stream.h" #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -815,6 +818,47 @@ class MKLDNNDeviceContext : public CPUDeviceContext { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomDeviceContext : public DeviceContext { + public: + explicit CustomDeviceContext(CustomPlace place); + virtual ~CustomDeviceContext(); + + const Place& GetPlace() const override; + void Wait() const override; + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + C_Stream stream() const { + return reinterpret_cast(stream_->raw_stream()); + } + + template + void AddStreamCallback(Callback&& callback) const { + return stream_->AddCallback(callback); + } + + void WaitStreamCallback() const { return stream_->WaitCallback(); } + + private: + std::string device_type_; + + CustomPlace place_; + + std::shared_ptr stream_; + + CustomDeviceContext(); + DISABLE_COPY_AND_ASSIGN(CustomDeviceContext); +}; +template <> +struct DefaultDeviceContextType { + using TYPE = CustomDeviceContext; +}; +#else +template <> +struct DefaultDeviceContextType { + using TYPE = DeviceContext; +}; +#endif + /*! \brief device context pool singleton */ class DeviceContextPool { public: diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index b969ba971b6b1ec2ca1ad6e8c0c28fdf07bb6431..39f95a9295661b2b3432d7ca062b2bdb1fe5c40a 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -433,8 +433,9 @@ PADDLE_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) /** * Memory related FLAG diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index f7a86e5aac765c68e3f11e8adcfdf1c9a75aba7c..5d0fccf9e9d4188e66ac54213271ac7cb10d019e 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -25,6 +25,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cupti.h" #endif +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" @@ -234,6 +235,19 @@ void InitDevices(const std::vector devices) { if (!custom_kernel_root.empty()) { LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root; framework::LoadCustomKernel(custom_kernel_root); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::LoadCustomDevice(custom_kernel_root)) { + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (auto &dev_type : device_types) { + VLOG(1) << "Device type: " << dev_type << ", visible devices count: " + << platform::DeviceManager::GetDeviceCount(dev_type); + for (size_t i = 0; + i < platform::DeviceManager::GetDeviceCount(dev_type); i++) { + places.push_back(platform::CustomPlace(dev_type, i)); + } + } + } +#endif } else { VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty."; } diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index e73e3736f64b462f03e6cda1e6212fcfe55c9939..b73e2e398f270646b19cca06274e549a4a4b62ba 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -56,7 +56,16 @@ bool is_npu_pinned_place(const Place &p) { return p.GetType() == pten::AllocationType::NPUPINNED; } +bool is_custom_place(const Place &p) { + return p.GetType() == pten::AllocationType::CUSTOM; +} + bool places_are_same_class(const Place &p1, const Place &p2) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (is_custom_place(p1) && is_custom_place(p2)) { + return p1.GetDeviceType() == p2.GetDeviceType(); + } +#endif return p1.GetType() == p2.GetType(); } @@ -73,6 +82,8 @@ bool is_same_place(const Place &p1, const Place &p2) { return p1 == p2; } else if (is_ipu_place(p1)) { return p1 == p2; + } else if (is_custom_place(p1)) { + return p1 == p2; } else { return p1 == p2; } @@ -81,5 +92,43 @@ bool is_same_place(const Place &p1, const Place &p2) { } } +#ifdef PADDLE_WITH_CUSTOM_DEVICE +std::string PlaceHelper::GetDeviceType(const Place &place) { + if (is_cpu_place(place)) { + return "cpu"; + } else if (is_gpu_place(place)) { + return "gpu"; + } else if (is_npu_place(place)) { + return "npu"; + } else if (is_xpu_place(place)) { + return "xpu"; + } else if (is_custom_place(place)) { + return place.GetDeviceType(); + } else { + PADDLE_THROW(platform::errors::Fatal( + "Unknown device type. 
Please check available devices by " + "paddle.device.get_available_device()")); + } +} + +size_t PlaceHelper::GetDeviceId(const Place &place) { + return place.GetDeviceId(); +} + +Place PlaceHelper::CreatePlace(const std::string &dev_type, size_t dev_id) { + if (dev_type == "cpu") { + return platform::CPUPlace(); + } else if (dev_type == "gpu") { + return platform::CUDAPlace(dev_id); + } else if (dev_type == "npu") { + return platform::NPUPlace(dev_id); + } else if (dev_type == "xpu") { + return platform::XPUPlace(dev_id); + } else { + return platform::CustomPlace(dev_type, dev_id); + } +} +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 80bbeac251810b6d32167433292fc55c3105234e..278bfad003cd444143fc98f3f8382687073cc483 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -36,9 +36,19 @@ using NPUPinnedPlace = pten::NPUPinnedPlace; using XPUPlace = pten::XPUPlace; using IPUPlace = pten::IPUPlace; using MLUPlace = pten::MLUPlace; +using CustomPlace = pten::CustomPlace; using PlaceList = std::vector; +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class PlaceHelper { + public: + static std::string GetDeviceType(const Place &place); + static size_t GetDeviceId(const Place &place); + static Place CreatePlace(const std::string &dev_type, size_t dev_id = 0); +}; +#endif + bool is_gpu_place(const Place &); bool is_xpu_place(const Place &); bool is_npu_place(const Place &); @@ -47,6 +57,7 @@ bool is_ipu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); bool is_npu_pinned_place(const Place &); +bool is_custom_place(const Place &p); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); @@ -121,6 +132,15 @@ typename Visitor::result_type VisitPlace(const Place &place, #else PADDLE_THROW(platform::errors::Unavailable( "Paddle is not compiled with MLU. Cannot visit mlu device")); +#endif + } + case pten::AllocationType::CUSTOM: { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::CustomPlace p(place.GetDeviceType(), place.GetDeviceId()); + return visitor(p); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUSTOM. 
Cannot visit custom device")); #endif } default: { diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 626847f04653cae1acec7dc06d594700aa5d1d70..320e989bd9bb1881e7f1ad0d6d5506fb6e313e24 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -5,3 +5,4 @@ cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) +add_subdirectory(dump) diff --git a/paddle/fluid/platform/profiler/dump/CMakeLists.txt b/paddle/fluid/platform/profiler/dump/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e25333f7a8a73864137a85bc64fe28506b86e081 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/CMakeLists.txt @@ -0,0 +1,4 @@ +proto_library(nodetreeproto SRCS nodetree.proto) +cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node) +cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node) +cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node) diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1049a7dc190845dc91013f688a27224f5e26b0e --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" + +#include + +namespace paddle { +namespace platform { + +DeserializationReader::DeserializationReader(const std::string& filename) + : filename_(filename) { + OpenFile(); + node_trees_proto_ = new NodeTreesProto(); +} + +DeserializationReader::DeserializationReader(const char* filename) + : filename_(filename) { + OpenFile(); + node_trees_proto_ = new NodeTreesProto(); +} + +void DeserializationReader::OpenFile() { + input_file_stream_.open(filename_, std::ifstream::in | std::ifstream::binary); + if (!input_file_stream_) { + VLOG(2) << "Unable to open file for writing profiling data." << std::endl; + } else { + VLOG(0) << "Read profiling data from " << filename_ << std::endl; + } +} + +std::unique_ptr DeserializationReader::Parse() { + if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) { + VLOG(2) << "Unable to load node trees in protobuf." 
<< std::endl; + return nullptr; + } + std::map thread_event_trees_map; + for (int node_tree_index = 0; + node_tree_index < node_trees_proto_->thread_trees_size(); + node_tree_index++) { + // handle one thread tree + std::map index_node_map; + std::map child_parent_map; + const ThreadNodeTreeProto& thread_node_tree_proto = + node_trees_proto_->thread_trees(node_tree_index); + uint64_t current_threadid = thread_node_tree_proto.thread_id(); + for (int host_node_index = 0; + host_node_index < thread_node_tree_proto.host_nodes_size(); + host_node_index++) { + // handle host node + const HostTraceEventNodeProto& host_node_proto = + thread_node_tree_proto.host_nodes(host_node_index); + HostTraceEventNode* host_node = + RestoreHostTraceEventNode(host_node_proto); + index_node_map[host_node_proto.id()] = host_node; + child_parent_map[host_node_proto.id()] = host_node_proto.parentid(); + // handle runtime node + for (int runtime_node_index = 0; + runtime_node_index < host_node_proto.runtime_nodes_size(); + runtime_node_index++) { + const CudaRuntimeTraceEventNodeProto& runtime_node_proto = + host_node_proto.runtime_nodes(runtime_node_index); + CudaRuntimeTraceEventNode* runtime_node = + RestoreCudaRuntimeTraceEventNode(runtime_node_proto); + host_node->AddCudaRuntimeNode(runtime_node); // insert into host_node + // handle device node + for (int device_node_index = 0; + device_node_index < runtime_node_proto.device_nodes_size(); + device_node_index++) { + const DeviceTraceEventNodeProto& device_node_proto = + runtime_node_proto.device_nodes(device_node_index); + DeviceTraceEventNode* device_node = + RestoreDeviceTraceEventNode(device_node_proto); + runtime_node->AddDeviceTraceEventNode( + device_node); // insert into runtime_node + } + } + } + // restore parent-child relationship + for (auto it = child_parent_map.begin(); it != child_parent_map.end(); + it++) { + if (it->second != -1) { // not root node + index_node_map[it->second]->AddChild(index_node_map[it->first]); + } else { + thread_event_trees_map[current_threadid] = + index_node_map[it->first]; // root node + } + } + } + // restore NodeTrees object + return std::unique_ptr(new NodeTrees(thread_event_trees_map)); +} + +DeserializationReader::~DeserializationReader() { + delete node_trees_proto_; + input_file_stream_.close(); +} + +DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode( + const DeviceTraceEventNodeProto& device_node_proto) { + const DeviceTraceEventProto& device_event_proto = + device_node_proto.device_event(); + DeviceTraceEvent device_event; + device_event.name = device_event_proto.name(); + device_event.type = static_cast(device_event_proto.type()); + device_event.start_ns = device_event_proto.start_ns(); + device_event.end_ns = device_event_proto.end_ns(); + device_event.device_id = device_event_proto.device_id(); + device_event.context_id = device_event_proto.context_id(); + device_event.stream_id = device_event_proto.stream_id(); + device_event.correlation_id = device_event_proto.correlation_id(); + switch (device_event.type) { + case TracerEventType::Kernel: + device_event.kernel_info = HandleKernelEventInfoProto(device_event_proto); + break; + + case TracerEventType::Memcpy: + device_event.memcpy_info = HandleMemcpyEventInfoProto(device_event_proto); + break; + + case TracerEventType::Memset: + device_event.memset_info = HandleMemsetEventInfoProto(device_event_proto); + break; + default: + break; + } + return new DeviceTraceEventNode(device_event); +} + +CudaRuntimeTraceEventNode* 
+DeserializationReader::RestoreCudaRuntimeTraceEventNode( + const CudaRuntimeTraceEventNodeProto& runtime_node_proto) { + const CudaRuntimeTraceEventProto& runtime_event_proto = + runtime_node_proto.runtime_trace_event(); + RuntimeTraceEvent runtime_event; + runtime_event.name = runtime_event_proto.name(); + runtime_event.start_ns = runtime_event_proto.start_ns(); + runtime_event.end_ns = runtime_event_proto.end_ns(); + runtime_event.process_id = runtime_event_proto.process_id(); + runtime_event.thread_id = runtime_event_proto.thread_id(); + runtime_event.correlation_id = runtime_event_proto.correlation_id(); + runtime_event.callback_id = runtime_event_proto.callback_id(); + return new CudaRuntimeTraceEventNode(runtime_event); +} + +HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode( + const HostTraceEventNodeProto& host_node_proto) { + const HostTraceEventProto& host_event_proto = + host_node_proto.host_trace_event(); + HostTraceEvent host_event; + host_event.name = host_event_proto.name(); + host_event.type = static_cast(host_event_proto.type()); + host_event.start_ns = host_event_proto.start_ns(); + host_event.end_ns = host_event_proto.end_ns(); + host_event.process_id = host_event_proto.process_id(); + host_event.thread_id = host_event_proto.thread_id(); + return new HostTraceEventNode(host_event); +} + +KernelEventInfo DeserializationReader::HandleKernelEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const KernelEventInfoProto& kernel_info_proto = + device_event_proto.kernel_info(); + KernelEventInfo kernel_info; + kernel_info.block_x = kernel_info_proto.block_x(); + kernel_info.block_y = kernel_info_proto.block_y(); + kernel_info.block_z = kernel_info_proto.block_z(); + kernel_info.grid_x = kernel_info_proto.grid_x(); + kernel_info.grid_y = kernel_info_proto.grid_y(); + kernel_info.grid_z = kernel_info_proto.grid_z(); + kernel_info.dynamic_shared_memory = kernel_info_proto.dynamic_shared_memory(); + kernel_info.static_shared_memory = kernel_info_proto.static_shared_memory(); + kernel_info.registers_per_thread = kernel_info_proto.registers_per_thread(); + kernel_info.local_memory_per_thread = + kernel_info_proto.local_memory_per_thread(); + kernel_info.local_memory_total = kernel_info_proto.local_memory_total(); + kernel_info.queued = kernel_info_proto.queued(); + kernel_info.submitted = kernel_info_proto.submitted(); + kernel_info.completed = kernel_info_proto.completed(); + return kernel_info; +} + +MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const MemcpyEventInfoProto& memcpy_info_proto = + device_event_proto.memcpy_info(); + MemcpyEventInfo memcpy_info; + memcpy_info.num_bytes = memcpy_info_proto.num_bytes(); + std::strncpy(memcpy_info.copy_kind, memcpy_info_proto.copy_kind().c_str(), + kMemKindMaxLen - 1); + std::strncpy(memcpy_info.src_kind, memcpy_info_proto.src_kind().c_str(), + kMemKindMaxLen - 1); + std::strncpy(memcpy_info.dst_kind, memcpy_info_proto.dst_kind().c_str(), + kMemKindMaxLen - 1); + return memcpy_info; +} + +MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const MemsetEventInfoProto& memset_info_proto = + device_event_proto.memset_info(); + MemsetEventInfo memset_info; + memset_info.num_bytes = memset_info_proto.num_bytes(); + std::strncpy(memset_info.memory_kind, memset_info_proto.memory_kind().c_str(), + kMemKindMaxLen - 1); + memset_info.value = 
memset_info_proto.value(); + return memset_info; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h new file mode 100644 index 0000000000000000000000000000000000000000..1ad2dabf229ad5665db6cc9f9ec43470f0b232f3 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <fstream> + +#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" +#include "paddle/fluid/platform/profiler/event_node.h" + +namespace paddle { +namespace platform { + +class DeserializationReader { + public: + explicit DeserializationReader(const std::string& filename); + explicit DeserializationReader(const char* filename); + ~DeserializationReader(); + std::unique_ptr<NodeTrees> Parse(); + + private: + void OpenFile(); + DeviceTraceEventNode* RestoreDeviceTraceEventNode( + const DeviceTraceEventNodeProto&); + CudaRuntimeTraceEventNode* RestoreCudaRuntimeTraceEventNode( + const CudaRuntimeTraceEventNodeProto&); + HostTraceEventNode* RestoreHostTraceEventNode(const HostTraceEventNodeProto&); + KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&); + MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&); + MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&); + std::string filename_; + std::ifstream input_file_stream_; + NodeTreesProto* node_trees_proto_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto new file mode 100644 index 0000000000000000000000000000000000000000..37dac0e597ce208da05271ff88c6f28b3c9dd9f9 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -0,0 +1,181 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
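+// The messages below mirror the in-memory profiler event tree that the
+// serialization logger walks: a NodeTreesProto holds one ThreadNodeTreeProto
+// per thread, every host trace event node owns the CUDA runtime nodes
+// recorded under it, and every runtime node owns the device (kernel, memcpy,
+// memset) nodes correlated with it.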
+ +syntax = "proto2"; +package paddle.platform; + +enum TracerEventTypeProto { + // Used to mark operator record + Operator = 0; + // Used to mark dataloader record + Dataloader = 1; + // Used to mark profile step record + ProfileStep = 2; + // Used to mark cuda runtime record returned by cupti + CudaRuntime = 3; + // Used to mark kernel computation record returned by cupti + Kernel = 4; + // Used to mark memcpy record returned by cupti + Memcpy = 5; + // Used to mark memset record returned by cupti + Memset = 6; + // Used to mark record defined by user + UserDefined = 7; + // A flag to denote the number of current types + NumTypes = 8; +} + +message KernelEventInfoProto { + // The X-dimension block size for the kernel. + required uint32 block_x = 1; + // The Y-dimension block size for the kernel. + required uint32 block_y = 2; + // The Z-dimension grid size for the kernel. + required uint32 block_z = 3; + // X-dimension of a grid. + required uint32 grid_x = 4; + // Y-dimension of a grid. + required uint32 grid_y = 5; + // Z-dimension of a grid. + required uint32 grid_z = 6; + // The dynamic shared memory reserved for the kernel, in bytes. + required uint32 dynamic_shared_memory = 7; + // The static shared memory allocated for the kernel, in bytes. + required uint32 static_shared_memory = 8; + // The number of registers required for each thread executing the kernel. + required uint32 registers_per_thread = 9; + // The amount of local memory reserved for each thread, in bytes. + required uint32 local_memory_per_thread = 10; + // The total amount of local memory reserved for the kernel, in bytes. + required uint32 local_memory_total = 11; + // The timestamp when the kernel is queued up in the command buffer, in ns. + // This timestamp is not collected by default. Use API + // cuptiActivityEnableLatencyTimestamps() to enable collection. + required uint64 queued = 12; + // The timestamp when the command buffer containing the kernel launch is + // submitted to the GPU, in ns. + // This timestamp is not collected by default. Use API + // cuptiActivityEnableLatencyTimestamps() to enable collection. + required uint64 submitted = 13; + // The completed timestamp for the kernel execution, in ns. + required uint64 completed = 14; +} + +message MemcpyEventInfoProto { + // The number of bytes transferred by the memory copy. + required uint64 num_bytes = 1; + // The kind of the memory copy. + // Each kind represents the source and destination targets of a memory copy. + // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind + required string copy_kind = 2; + // The source memory kind read by the memory copy. + // Each kind represents the type of the memory accessed by a memory + // operation/copy. Refer to CUpti_ActivityMemoryKind + required string src_kind = 3; + // The destination memory kind read by the memory copy. + required string dst_kind = 4; +} + +message MemsetEventInfoProto { + // The number of bytes being set by the memory set. + required uint64 num_bytes = 1; + // The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind + required string memory_kind = 2; + // the value being assigned to memory by the memory set. 
+ required uint32 value = 3; +} + +message HostTraceEventProto { + required string name = 1; + required TracerEventTypeProto type = 2; + // start timestamp of the record + required uint64 start_ns = 3; + // end timestamp of the record + required uint64 end_ns = 4; + // process id of the record + required uint64 process_id = 5; + // thread id of the record + required uint64 thread_id = 6; +} + +message CudaRuntimeTraceEventProto { + // record name + required string name = 1; + // start timestamp of the record + required uint64 start_ns = 2; + // end timestamp of the record + required uint64 end_ns = 3; + // process id of the record + required uint64 process_id = 4; + // thread id of the record + required uint64 thread_id = 5; + // correlation id, used for correlating async activities happened on device + required uint32 correlation_id = 6; + // callback id, used to identify which cuda runtime api is called + required uint32 callback_id = 7; +} + +message DeviceTraceEventProto { + // record name + required string name = 1; + // record type, one of TracerEventType + required TracerEventTypeProto type = 2; + // start timestamp of the record + required uint64 start_ns = 3; + // end timestamp of the record + required uint64 end_ns = 4; + // device id + required uint64 device_id = 5; + // context id + required uint64 context_id = 6; + // stream id + required uint64 stream_id = 7; + // correlation id, used for correlating async activities happened on device + required uint32 correlation_id = 8; + // union, specific device record type has different detail information + oneof detail_info { + // used for TracerEventType::Kernel + KernelEventInfoProto kernel_info = 9; + // used for TracerEventType::Memcpy + MemcpyEventInfoProto memcpy_info = 10; + // used for TracerEventType::Memset + MemsetEventInfoProto memset_info = 11; + } +} + +message DeviceTraceEventNodeProto { + required DeviceTraceEventProto device_event = 1; +} + +message CudaRuntimeTraceEventNodeProto { + required CudaRuntimeTraceEventProto runtime_trace_event = 1; + repeated DeviceTraceEventNodeProto device_nodes = 2; +} + +message HostTraceEventNodeProto { + required int64 id = 1; + required int64 parentid = 2; + required HostTraceEventProto host_trace_event = 3; + repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4; +} + +message ThreadNodeTreeProto { + required uint64 thread_id = 1; + repeated HostTraceEventNodeProto host_nodes = 2; +} + +message NodeTreesProto { + required string version = 1; + repeated ThreadNodeTreeProto thread_trees = 2; +} diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9ed84bd438a7e2ac95a6637b6efcae870a8ad75 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -0,0 +1,265 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "glog/logging.h" + +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/utils.h" + +namespace paddle { +namespace platform { + +static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; +static const char* version = "1.0.0"; + +static std::string DefaultFileName() { + auto pid = GetProcessId(); + return string_format(std::string(kDefaultFilename), pid, + GetStringFormatLocalTime().c_str()); +} + +void SerializationLogger::OpenFile() { + output_file_stream_.open(filename_, std::ofstream::out | + std::ofstream::trunc | + std::ofstream::binary); + if (!output_file_stream_) { + LOG(WARNING) << "Unable to open file for writing profiling data." + << std::endl; + } else { + LOG(INFO) << "writing profiling data to " << filename_ << std::endl; + } + node_trees_proto_ = new NodeTreesProto(); + node_trees_proto_->set_version(std::string(version)); +} + +void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { + // dump the whole tree into file + const std::map> + thread2host_event_nodes = node_trees.Traverse(true); + + for (auto it = thread2host_event_nodes.begin(); + it != thread2host_event_nodes.end(); ++it) { + // 1. order every node an index, every node a parent + std::map node_index_map; + std::map node_parent_map; + int64_t index = 0; + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + node_index_map[(*hostnode)] = index; // order each node + index++; + } + node_parent_map[(*(it->second.begin()))] = -1; // root's parent set as -1 + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + for (auto childnode = (*hostnode)->GetChildren().begin(); + childnode != (*hostnode)->GetChildren().end(); ++childnode) { + node_parent_map[(*childnode)] = + node_index_map[(*hostnode)]; // mark each node's parent + } + } + + // 2. 
serialize host node, runtime node and device node + current_thread_node_tree_proto_ = + node_trees_proto_->add_thread_trees(); // add ThreadNodeTreeProto + current_thread_node_tree_proto_->set_thread_id(it->first); + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + HostTraceEventNodeProto* host_node_proto = + current_thread_node_tree_proto_ + ->add_host_nodes(); // add HostTraceEventNodeProto + host_node_proto->set_id(node_index_map[(*hostnode)]); + host_node_proto->set_parentid(node_parent_map[(*hostnode)]); + current_host_trace_event_node_proto_ = + host_node_proto; // set current HostTraceEventNodeProto + (*hostnode)->LogMe(this); // fill detail information + + for (auto runtimenode = (*hostnode)->GetRuntimeTraceEventNodes().begin(); + runtimenode != (*hostnode)->GetRuntimeTraceEventNodes().end(); + ++runtimenode) { + CudaRuntimeTraceEventNodeProto* runtime_node_proto = + current_host_trace_event_node_proto_ + ->add_runtime_nodes(); // add CudaRuntimeTraceEventNodeProto + current_runtime_trace_event_node_proto_ = + runtime_node_proto; // set current CudaRuntimeTraceEventNodeProto + (*runtimenode)->LogMe(this); // fill detail information + for (auto devicenode = + (*runtimenode)->GetDeviceTraceEventNodes().begin(); + devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end(); + ++devicenode) { + DeviceTraceEventNodeProto* device_node_proto = + current_runtime_trace_event_node_proto_ + ->add_device_nodes(); // add DeviceTraceEventNodeProto + current_device_trace_event_node_proto_ = + device_node_proto; // set current DeviceTraceEventNodeProto + (*devicenode)->LogMe(this); // fill detail information + } + } + } + } +} + +void SerializationLogger::LogHostTraceEventNode( + const HostTraceEventNode& host_node) { + HostTraceEventProto* host_trace_event = new HostTraceEventProto(); + host_trace_event->set_name(host_node.Name()); + host_trace_event->set_type( + static_cast(host_node.Type())); + host_trace_event->set_start_ns(host_node.StartNs()); + host_trace_event->set_end_ns(host_node.EndNs()); + host_trace_event->set_process_id(host_node.ProcessId()); + host_trace_event->set_thread_id(host_node.ThreadId()); + current_host_trace_event_node_proto_->set_allocated_host_trace_event( + host_trace_event); +} + +void SerializationLogger::LogRuntimeTraceEventNode( + const CudaRuntimeTraceEventNode& runtime_node) { + CudaRuntimeTraceEventProto* runtime_trace_event = + new CudaRuntimeTraceEventProto(); + runtime_trace_event->set_name(runtime_node.Name()); + runtime_trace_event->set_start_ns(runtime_node.StartNs()); + runtime_trace_event->set_end_ns(runtime_node.EndNs()); + runtime_trace_event->set_process_id(runtime_node.ProcessId()); + runtime_trace_event->set_thread_id(runtime_node.ThreadId()); + runtime_trace_event->set_correlation_id(runtime_node.CorrelationId()); + runtime_trace_event->set_callback_id(runtime_node.CallbackId()); + current_runtime_trace_event_node_proto_->set_allocated_runtime_trace_event( + runtime_trace_event); +} + +void SerializationLogger::LogDeviceTraceEventNode( + const DeviceTraceEventNode& device_node) { + switch (device_node.Type()) { + case TracerEventType::Kernel: + HandleTypeKernel(device_node); + break; + case TracerEventType::Memcpy: + HandleTypeMemcpy(device_node); + break; + case TracerEventType::Memset: + HandleTypeMemset(device_node); + break; + default: + break; + } +} + +void SerializationLogger::HandleTypeKernel( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new 
DeviceTraceEventProto(); + KernelEventInfoProto* kernel_info = new KernelEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill KernelEventInfoProto + KernelEventInfo info = device_node.KernelInfo(); + kernel_info->set_block_x(info.block_x); + kernel_info->set_block_y(info.block_y); + kernel_info->set_block_z(info.block_z); + kernel_info->set_grid_x(info.grid_x); + kernel_info->set_grid_y(info.grid_y); + kernel_info->set_grid_z(info.grid_z); + kernel_info->set_dynamic_shared_memory(info.dynamic_shared_memory); + kernel_info->set_static_shared_memory(info.static_shared_memory); + kernel_info->set_registers_per_thread(info.registers_per_thread); + kernel_info->set_local_memory_per_thread(info.local_memory_per_thread); + kernel_info->set_local_memory_total(info.local_memory_total); + kernel_info->set_queued(info.queued); + kernel_info->set_submitted(info.submitted); + kernel_info->set_completed(info.completed); + // binding + device_trace_event->set_allocated_kernel_info(kernel_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +void SerializationLogger::HandleTypeMemcpy( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto(); + MemcpyEventInfoProto* memcpy_info = new MemcpyEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill MemcpyEventInfoProto + MemcpyEventInfo info = device_node.MemcpyInfo(); + memcpy_info->set_num_bytes(info.num_bytes); + memcpy_info->set_copy_kind(std::string(info.copy_kind)); + memcpy_info->set_src_kind(std::string(info.src_kind)); + memcpy_info->set_dst_kind(std::string(info.dst_kind)); + // binding + device_trace_event->set_allocated_memcpy_info(memcpy_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +void SerializationLogger::HandleTypeMemset( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto(); + MemsetEventInfoProto* memset_info = new MemsetEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + 
device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill MemsetEventInfoProto + MemsetEventInfo info = device_node.MemsetInfo(); + memset_info->set_num_bytes(info.num_bytes); + memset_info->set_memory_kind(std::string(info.memory_kind)); + memset_info->set_value(info.value); + // binding + device_trace_event->set_allocated_memset_info(memset_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +SerializationLogger::SerializationLogger(const std::string& filename) { + filename_ = filename.empty() ? DefaultFileName() : filename; + OpenFile(); +} + +SerializationLogger::SerializationLogger(const char* filename_cstr) { + std::string filename(filename_cstr); + filename_ = filename.empty() ? DefaultFileName() : filename; + OpenFile(); +} + +SerializationLogger::~SerializationLogger() { + if (!output_file_stream_) { + delete node_trees_proto_; + return; + } + node_trees_proto_->SerializeToOstream(&output_file_stream_); + delete node_trees_proto_; + output_file_stream_.close(); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h new file mode 100755 index 0000000000000000000000000000000000000000..1295be95d45316d6884b68b3115caefa7905d673 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" +#include "paddle/fluid/platform/profiler/output_logger.h" + +namespace paddle { +namespace platform { + +// Dumps a NodeTrees object into a protobuf file. +// A SerializationLogger can only dump one NodeTrees object; it creates the +// output file in its constructor and closes it in its destructor. 
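+//
+// A minimal usage sketch (the file name is illustrative, not part of this
+// patch); it assumes a NodeTrees object `tree` has already been built from
+// collected trace events, as in the tests under profiler/dump:
+//
+//   SerializationLogger logger("worker0.paddle_trace.pb");
+//   tree.LogMe(&logger);   // fills the protobuf representation of the tree;
+//                          // the file is written when `logger` is destroyed
+//
+//   DeserializationReader reader("worker0.paddle_trace.pb");
+//   std::unique_ptr<NodeTrees> restored = reader.Parse();
+//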
+class SerializationLogger : public BaseLogger { + public: + explicit SerializationLogger(const std::string& filename); + explicit SerializationLogger(const char* filename); + ~SerializationLogger(); + std::string filename() { return filename_; } + void LogDeviceTraceEventNode(const DeviceTraceEventNode&) override; + void LogHostTraceEventNode(const HostTraceEventNode&) override; + void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; + void LogNodeTrees(const NodeTrees&) override; + + private: + void OpenFile(); + void HandleTypeKernel(const DeviceTraceEventNode&); + void HandleTypeMemset(const DeviceTraceEventNode&); + void HandleTypeMemcpy(const DeviceTraceEventNode&); + std::string filename_; + std::ofstream output_file_stream_; + NodeTreesProto* node_trees_proto_; + ThreadNodeTreeProto* current_thread_node_tree_proto_; + HostTraceEventNodeProto* current_host_trace_event_node_proto_; + CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_; + DeviceTraceEventNodeProto* current_device_trace_event_node_proto_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc new file mode 100644 index 0000000000000000000000000000000000000000..2fe9626ec76df5654d19e785d043311f5f00496e --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
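+
+// These tests build small synthetic event trees, dump them with
+// SerializationLogger, and restore one of the dumped files with
+// DeserializationReader. Note that restore_case0 reads the
+// test_serialization_logger_case0.pb file written by dump_case0.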
+ +#include "gtest/gtest.h" + +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/event_node.h" + +using paddle::platform::SerializationLogger; +using paddle::platform::DeserializationReader; +using paddle::platform::NodeTrees; +using paddle::platform::HostTraceEventNode; +using paddle::platform::CudaRuntimeTraceEventNode; +using paddle::platform::DeviceTraceEventNode; +using paddle::platform::HostTraceEvent; +using paddle::platform::RuntimeTraceEvent; +using paddle::platform::DeviceTraceEvent; +using paddle::platform::TracerEventType; +using paddle::platform::KernelEventInfo; +using paddle::platform::MemcpyEventInfo; +using paddle::platform::MemsetEventInfo; + +TEST(SerializationLoggerTest, dump_case0) { + std::list host_events; + std::list runtime_events; + std::list device_events; + host_events.push_back(HostTraceEvent(std::string("dataloader#1"), + TracerEventType::Dataloader, 1000, 10000, + 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, + 17000, 10, 10, 1, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, + 35000, 10, 10, 2, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, + 37000, 10, 11, 3, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, + 19000, 10, 10, 4, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, + 39000, 10, 11, 5, 0)); + device_events.push_back( + DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, + 55000, 0, 10, 10, 1, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, + 95000, 0, 10, 10, 2, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, + 65000, 0, 10, 11, 3, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, + 59000, 0, 10, 10, 4, MemcpyEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, + 69000, 0, 10, 11, 5, MemsetEventInfo())); + SerializationLogger logger("test_serialization_logger_case0.pb"); + NodeTrees tree(host_events, runtime_events, device_events); + std::map> nodes = + tree.Traverse(true); + EXPECT_EQ(nodes[10].size(), 4u); + EXPECT_EQ(nodes[11].size(), 2u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 3u); + } + if ((*it)->Name() == "op1") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "op3") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + tree.LogMe(&logger); +} + 
+TEST(SerializationLoggerTest, dump_case1) { + std::list host_events; + std::list runtime_events; + std::list device_events; + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, + 17000, 10, 10, 1, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, + 35000, 10, 10, 2, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, + 37000, 10, 11, 3, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, + 19000, 10, 10, 4, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, + 39000, 10, 11, 5, 0)); + device_events.push_back( + DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, + 55000, 0, 10, 10, 1, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, + 95000, 0, 10, 10, 2, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, + 65000, 0, 10, 11, 3, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, + 59000, 0, 10, 10, 4, MemcpyEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, + 69000, 0, 10, 11, 5, MemsetEventInfo())); + SerializationLogger logger("test_serialization_logger_case1.pb"); + NodeTrees tree(host_events, runtime_events, device_events); + std::map> nodes = + tree.Traverse(true); + EXPECT_EQ(nodes[10].size(), 1u); + EXPECT_EQ(nodes[11].size(), 1u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + tree.LogMe(&logger); +} + +TEST(DeserializationReaderTest, restore_case0) { + DeserializationReader reader("test_serialization_logger_case0.pb"); + std::unique_ptr tree = reader.Parse(); + std::map> nodes = + tree->Traverse(true); + EXPECT_EQ(nodes[10].size(), 4u); + EXPECT_EQ(nodes[11].size(), 2u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 3u); + } + if ((*it)->Name() == "op1") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "op3") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } +} diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..04014b972c3e3599beef0a60635fa122a153233f --- /dev/null +++ b/paddle/fluid/platform/profiler/utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/platform/os_info.h" + +namespace paddle { +namespace platform { + +template +std::string string_format(const std::string& format, Args... args) { + int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + + 1; // Extra space for '\0' + PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( + "Error during profiler data formatting.")); + auto size = static_cast(size_s); + auto buf = std::make_unique(size); + std::snprintf(buf.get(), size, format.c_str(), args...); + return std::string(buf.get(), size - 1); // exclude the '\0' +} + +static std::string GetStringFormatLocalTime() { + std::time_t rawtime; + std::tm* timeinfo; + char buf[100]; + std::time(&rawtime); + timeinfo = std::localtime(&rawtime); + std::strftime(buf, 100, "%F-%X", timeinfo); + return std::string(buf); +} + +static int64_t nsToUs(int64_t ns) { return ns / 1000; } + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 265f0fba8f376e5c4e748415469f1b4caab1d4c4..b1fe9f99b5d428d735a6e6734ccd5d7d6faa74e8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -284,7 +284,7 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB}) if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 0422a9cf8cc0ad984621fe09ee28bb7d624897d6..7bb7f03983eb9e8c88f46174a40664f1110682d1 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -151,14 +151,9 @@ void BindFleetExecutor(py::module* m) { .def_readwrite("current_endpoint", &DistModelConfig::current_endpoint) .def_readwrite("nranks", &DistModelConfig::nranks) .def_readwrite("local_rank", &DistModelConfig::local_rank) - .def_readwrite("mp_degree", &DistModelConfig::mp_degree) - .def_readwrite("pp_degree", &DistModelConfig::pp_degree) - .def_readwrite("mp_ring_id", &DistModelConfig::mp_ring_id) - .def_readwrite("enable_timer", &DistModelConfig::enable_timer) - .def_readwrite("pp_upstream_ring_id", - &DistModelConfig::pp_upstream_ring_id) - .def_readwrite("pp_downstream_ring_id", - &DistModelConfig::pp_downstream_ring_id); + .def_readwrite("ring_id_to_ranks", &DistModelConfig::ring_id_to_ranks_) + .def_readwrite("rank_to_ring_ids", &DistModelConfig::rank_to_ring_ids_) + .def_readwrite("enable_timer", &DistModelConfig::enable_timer); py::class_(*m, "DistModel") .def(py::init()) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index f4e5df800dadaa9774062f704fb93b7a0ac746a9..6e882b5e0e4b07dd67a6b59747d2a89a6cc59fb7 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -45,7 +45,7 @@ PyTypeObject* p_tensor_type; extern PyTypeObject* g_vartype_pytype; extern PyTypeObject* g_framework_tensor_pytype; -PyObject* EagerTensorNew(PyTypeObject* type, PyObject* args, PyObject* 
kwargs) { +PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { PyObject* obj = type->tp_alloc(type, 0); if (obj) { auto v = reinterpret_cast(obj); @@ -56,14 +56,14 @@ PyObject* EagerTensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { } // TODO(jiabin): Overload this once we need more constructor in Python -void EmptyEagerTensorInitializer( - TensorObject* self, const std::string& name, - const paddle::platform::Place& place, bool persistable = false, - bool stop_gradient = true, framework::proto::VarType::Type dtype = - paddle::framework::proto::VarType::FP32, - const std::vector& dims = {}, - framework::proto::VarType::Type var_type = - paddle::framework::proto::VarType::LOD_TENSOR) { +void EmptyTensorInitializer(TensorObject* self, const std::string& name, + const paddle::platform::Place& place, + bool persistable = false, bool stop_gradient = true, + framework::proto::VarType::Type dtype = + paddle::framework::proto::VarType::FP32, + const std::vector& dims = {}, + framework::proto::VarType::Type var_type = + paddle::framework::proto::VarType::LOD_TENSOR) { auto ddims = paddle::framework::make_ddim(dims); PADDLE_ENFORCE_GE( paddle::framework::product(ddims), 0, @@ -98,46 +98,41 @@ void EmptyEagerTensorInitializer( } } -void InitEagerTensorWithNumpyValue(TensorObject* self, const py::object& array, - bool zero_copy = false) { +void InitTensorWithNumpyValue(TensorObject* self, const py::object& array, + bool zero_copy = false) { PADDLE_ENFORCE_EQ( self->tensor.defined(), true, paddle::platform::errors::Fatal( - "Calling InitEagerTensorWithNumpyValue of Eager Tensor without " - "EmptyEagerTensorInitializer is " + "Calling InitTensorWithNumpyValue of Eager Tensor without " + "EmptyTensorInitializer is " "forbidden. 
Please check your code and make sure you new a " "eager tensor before init it with NumPy.")); pten::DenseTensor* impl_ptr = static_cast(self->tensor.impl().get()); paddle::platform::Place place = impl_ptr->place(); - paddle::framework::LoDTensor temp_tensor = paddle::framework::LoDTensor(); if (platform::is_cpu_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, - zero_copy); + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_xpu_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, - zero_copy); + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_gpu_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_cuda_pinned_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray(&temp_tensor, array, place, - zero_copy); + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace")); } - *impl_ptr = temp_tensor; } -void InitEagerTensorWithEagerTensor(TensorObject* self, - const paddle::experimental::Tensor& src, - const paddle::platform::Place& place, - const std::string& name) { +void InitTensorWithTensor(TensorObject* self, + const paddle::experimental::Tensor& src, + const paddle::platform::Place& place, + const std::string& name) { self->tensor.set_name(name); if (place == src.inner_place()) { auto impl = std::static_pointer_cast(src.impl()); @@ -158,10 +153,10 @@ void InitEagerTensorWithEagerTensor(TensorObject* self, } } -void InitEagerTensorWithFrameworkTensor(TensorObject* self, - const framework::Tensor& src, - const paddle::platform::Place& place, - const std::string& name) { +void InitTensorWithFrameworkTensor(TensorObject* self, + const framework::Tensor& src, + const paddle::platform::Place& place, + const std::string& name) { self->tensor.set_name(name); if (place == src.place()) { self->tensor.set_impl(std::make_shared(src)); @@ -271,14 +266,14 @@ std::string ParseName(std::unordered_map kws_map, return act_name; } -// initialize EagerTensor by PyArray(first argument is PyArray, +// initialize Tensor by PyArray(first argument is PyArray, // mix args and kwargs) automatically. -void AutoInitEagerTensorByPyArray( - TensorObject* py_tensor_ptr, - std::unordered_map kws_map, PyObject* args, - bool flag_kwargs, Py_ssize_t args_num) { - // The first argument of the EagerTensor constructor is PyArray, - // there are 6 arguments to construct the new EagerTensor, +void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr, + std::unordered_map kws_map, + PyObject* args, bool flag_kwargs, + Py_ssize_t args_num) { + // The first argument of the Tensor constructor is PyArray, + // there are 6 arguments to construct the new Tensor, // kw_order_map's key is every arguments of the constructor, // kw_order_map's value is the position of the arguments respectively. 
// If u want to update this constructor with new arguments, @@ -306,20 +301,21 @@ void AutoInitEagerTensorByPyArray( stop_gradient = ParseBooleanArgs("stop_gradient", kws_map, kw_order_map, args, flag_kwargs, args_num); - EmptyEagerTensorInitializer(py_tensor_ptr, act_name, place, persistable, - stop_gradient); - InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy); + EmptyTensorInitializer(py_tensor_ptr, act_name, place, persistable, + stop_gradient); + InitTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy); } -// initialize EagerTensor by EagerTensor or framework::Tensor (mix args and +// initialize Tensor by Tensor or framework::Tensor (mix args and // kwargs) automatically. -void AutoInitEagerTensorByTensor( - TensorObject* py_tensor_ptr, - std::unordered_map kws_map, PyObject* args, - bool flag_kwargs, Py_ssize_t args_num, bool init_by_egr_tensor = true) { - // The first argument of the EagerTensor constructor is EagerTensor or +void AutoInitTensorByTensor(TensorObject* py_tensor_ptr, + std::unordered_map kws_map, + PyObject* args, bool flag_kwargs, + Py_ssize_t args_num, + bool init_by_egr_tensor = true) { + // The first argument of the Tensor constructor is Tensor or // framework Tensor, - // there are 3 arguments to construct the new EagerTensor, + // there are 3 arguments to construct the new Tensor, // kw_order_map's key is every arguments of the constructor, // kw_order_map's value is the position of the arguments respectively. // If u want to update this constructor with new arguments, @@ -345,14 +341,14 @@ void AutoInitEagerTensorByTensor( src_tensor = CastPyArg2Tensor(kws_map["value"], 0); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The first expected kwargs is {value: EagerTensor}, " - "but could not parse the first argument {value: EagerTensor} " + "The first expected kwargs is {value: Tensor}, " + "but could not parse the first argument {value: Tensor} " "successfully. " "Please check your input first and make sure you are on the right " "way.")); } } - InitEagerTensorWithEagerTensor(py_tensor_ptr, src_tensor, place, act_name); + InitTensorWithTensor(py_tensor_ptr, src_tensor, place, act_name); } else { // init by framework tensor framework::Tensor src_tensor; @@ -372,8 +368,7 @@ void AutoInitEagerTensorByTensor( "way.")); } } - InitEagerTensorWithFrameworkTensor(py_tensor_ptr, src_tensor, place, - act_name); + InitTensorWithFrameworkTensor(py_tensor_ptr, src_tensor, place, act_name); } } @@ -402,12 +397,12 @@ void AutoInitEagerTensorByTensor( * ** value: ndarray) * 5. * def __init__ ( - * ** tensor: EagerTensor) + * ** tensor: Tensor) * 6. (multi-place) * (should have at least one parameter, one parameter equals to case 5, zero * parameter equals to case 1.) * def __init__ ( - * ** tensor: EagerTensor, + * ** tensor: Tensor, * ** place: paddle::platform::Place, * ** name: std::string) * 7. 
(multi-place) (should have at least one parameter, one parameter similar @@ -417,7 +412,7 @@ void AutoInitEagerTensorByTensor( * ** place: paddle::platform::Place, * ** name: std::string) * **/ -int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { +int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { // set a flag to record use kwargs or not bool flag_kwargs = false; if (kwargs) flag_kwargs = true; @@ -427,7 +422,7 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PyObject* kw_persistable = NULL; PyObject* kw_stop_gradient = NULL; - PyObject* kw_value = NULL; // receive PyArray or EagerTensor + PyObject* kw_value = NULL; // receive PyArray or Tensor PyObject* kw_place = NULL; PyObject* kw_name = NULL; PyObject* kw_dims = NULL; @@ -490,7 +485,7 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { if (!flag_kwargs) { // case 1 VLOG(6) << "Calling case1's initializer."; - EmptyEagerTensorInitializer( + EmptyTensorInitializer( py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName("generated_tensor"), egr::Controller::Instance().GetExpectedPlace()); @@ -499,28 +494,28 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { if (kw_value != NULL) { if (pybind11::detail::npy_api::get().PyArray_Check_(kw_value)) { VLOG(6) << "Calling case3's or case4's initializer"; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, - flag_kwargs, args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else if (PyObject_IsInstance( kw_value, reinterpret_cast(p_tensor_type))) { VLOG(6) << "Calling case5's or case6's initializer"; - AutoInitEagerTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else if (PyObject_IsInstance(kw_value, reinterpret_cast( g_framework_tensor_pytype))) { VLOG(6) << "Calling case7's initializer."; - AutoInitEagerTensorByTensor( - py_tensor_ptr, kws_map, args, flag_kwargs, args_num, - /* false means not init by egr tensor*/ false); + AutoInitTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num, + /* false means not init by egr tensor*/ false); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( "Could not parse the first keyword argument successfully, " "the first keyword argument is value, but it should be PyArray " - "or EagerTensor or framework::Tensor. " + "or Tensor or framework::Tensor. " "Please check your input first and make sure you are on the " "right way.")); } @@ -573,18 +568,18 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { CastPyArg2ProtoType(kw_type, 0); bool persistable = CastPyArg2AttrBoolean(kw_persistable, 0); - EmptyEagerTensorInitializer( - py_tensor_ptr, act_name, - egr::Controller::Instance().GetExpectedPlace(), persistable, - /* stop_gradient */ true, dtype, dims, var_type); + EmptyTensorInitializer(py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), + persistable, + /* stop_gradient */ true, dtype, dims, var_type); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( - "We not only support construct EagerTensor from numpy value " - "or tensor(EagerTensor or framework::Tensor) " + "We not only support construct Tensor from numpy value " + "or tensor(Tensor or framework::Tensor) " "with python kwargs by this initializer, " - "but also even support dtype to init a empty EagerTensor. 
" + "but also even support dtype to init a empty Tensor. " "Please check your input first and make sure you call the existed " "constructor.")); } @@ -595,28 +590,28 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's or case4's initializer."; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else if (PyObject_IsInstance( arg0_ptr, reinterpret_cast(p_tensor_type))) { VLOG(6) << "Calling case5's or case6's initializer."; - AutoInitEagerTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( g_framework_tensor_pytype))) { VLOG(6) << "Calling case7's initializer."; - AutoInitEagerTensorByTensor( - py_tensor_ptr, kws_map, args, flag_kwargs, args_num, - /* false means not init by egr tensor*/ false); + AutoInitTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num, + /* false means not init by egr tensor*/ false); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( - "We support construct EagerTensor from numpy value " - "or tensor(EagerTensor or framework::Tensor) " + "We support construct Tensor from numpy value " + "or tensor(Tensor or framework::Tensor) " "with python args and kwargs by this initializer, " - "but the first argument should be PyArray or EagerTensor or " + "but the first argument should be PyArray or Tensor or " "framework::Tensor. " "Please check your input first and make sure you call the existed " "constructor.")); @@ -626,8 +621,8 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's or case4's initializer."; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -658,15 +653,14 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { paddle::framework::proto::VarType::Type var_type = CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 3), 3); bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); - EmptyEagerTensorInitializer( - py_tensor_ptr, act_name, - egr::Controller::Instance().GetExpectedPlace(), persistable, true, - dtype, dims, var_type); + EmptyTensorInitializer(py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), + persistable, true, dtype, dims, var_type); return 0; } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's initializer."; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -680,8 +674,8 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's or case4's initializer"; - 
AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -696,8 +690,8 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { if (!flag_kwargs) { // case 3 VLOG(6) << "Calling case3's initializer."; - AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, - args_num); + AutoInitTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { // six position args, remainting arguments are kwargs, but this // is not a right way @@ -716,7 +710,7 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { return 1; } -static void EagerTensorDealloc(TensorObject* self) { +static void TensorDealloc(TensorObject* self) { self->tensor.~Tensor(); Py_TYPE(self)->tp_free(reinterpret_cast(self)); } @@ -735,19 +729,19 @@ void BindEager(pybind11::module* module) { auto& internals = pybind11::detail::get_internals(); auto heap_type = reinterpret_cast( internals.default_metaclass->tp_alloc(internals.default_metaclass, 0)); - heap_type->ht_name = ToPyObject("EagerTensor"); - heap_type->ht_qualname = ToPyObject("EagerTensor"); + heap_type->ht_name = ToPyObject("Tensor"); + heap_type->ht_qualname = ToPyObject("Tensor"); auto type = &heap_type->ht_type; - type->tp_name = "EagerTensor"; + type->tp_name = "Tensor"; type->tp_basicsize = sizeof(TensorObject); - type->tp_dealloc = (destructor)EagerTensorDealloc; + type->tp_dealloc = (destructor)TensorDealloc; type->tp_as_number = &number_methods; type->tp_as_sequence = &sequence_methods; type->tp_as_mapping = &mapping_methods; type->tp_methods = variable_methods; type->tp_getset = variable_properties; - type->tp_init = EagerTensorInit; - type->tp_new = EagerTensorNew; + type->tp_init = TensorInit; + type->tp_new = TensorNew; Py_INCREF(internals.instance_base); type->tp_base = reinterpret_cast(internals.instance_base); type->tp_flags |= @@ -764,8 +758,8 @@ void BindEager(pybind11::module* module) { } Py_INCREF(type); - if (PyModule_AddObject(m.ptr(), "EagerTensor", - reinterpret_cast(type)) < 0) { + if (PyModule_AddObject(m.ptr(), "Tensor", reinterpret_cast(type)) < + 0) { Py_DECREF(type); Py_DECREF(m.ptr()); PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index a32edae2ad23cc215a0e91756fd6b54b145debda..c3f0aa2ec9c49d144f45d73d275c964f341a384b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -145,9 +145,8 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_api_read_next_eager_tensor_list(PyObject* self, - PyObject* args, - PyObject* kwargs) { +static PyObject* eager_api_read_next_tensor_list(PyObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY auto tensor_base_list = CastPyArg2VectorOfTensorBase(PyTuple_GET_ITEM(args, 0), 0); @@ -182,8 +181,8 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, METH_VARARGS | METH_KEYWORDS, NULL}, - {"read_next_eager_tensor_list", - (PyCFunction)(void (*)(void))eager_api_read_next_eager_tensor_list, + {"read_next_tensor_list", + (PyCFunction)(void (*)(void))eager_api_read_next_tensor_list, METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; diff --git 
a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 68653790366084ed8cce1cb007cd975fd0a4bc59..b8f462dfd51d1234b86a6b294628bbefd8a5c021 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -35,15 +35,15 @@ limitations under the License. */ namespace paddle { namespace pybind { -extern void InitEagerTensorWithNumpyValue(TensorObject* self, - const pybind11::object& array, - bool zero_copy); +extern void InitTensorWithNumpyValue(TensorObject* self, + const pybind11::object& array, + bool zero_copy); extern PyTypeObject* p_tensor_type; -static PyObject* eager_tensor_method_numpy(TensorObject* self, PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY PADDLE_ENFORCE_EQ( self->tensor.initialized(), true, platform::errors::InvalidArgument( @@ -99,18 +99,17 @@ static PyObject* eager_tensor_method_numpy(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method__is_initialized(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method__is_initialized(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY return ToPyObject(self->tensor.initialized()); EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method__copy_to(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = @@ -123,10 +122,10 @@ static PyObject* eager_tensor_method__copy_to(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method_reconstruct_from_(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_reconstruct_from_(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); std::string orig_name = self->tensor.name(); @@ -144,9 +143,9 @@ static PyObject* eager_tensor_method_reconstruct_from_(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method_copy_(TensorObject* self, PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); @@ -170,8 +169,8 @@ static PyObject* eager_tensor_method_copy_(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_retain_grads(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY if (egr::Controller::Instance().HasGrad()) { auto meta = egr::EagerUtils::autograd_meta(&(self->tensor)); @@ -187,10 +186,9 @@ static PyObject* eager_tensor_retain_grads(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__clear_gradient(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* 
tensor__clear_gradient(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY VLOG(4) << "ClearGradient " << self->tensor.name(); paddle::experimental::Tensor* grad; @@ -223,8 +221,8 @@ static PyObject* eager_tensor__clear_gradient(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__zero_grads(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY VLOG(4) << "ZeroGrads " << self->tensor.name(); @@ -257,10 +255,9 @@ static PyObject* eager_tensor__zero_grads(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__share_buffer_to(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor__share_buffer_to(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor* dst_ptr = &(reinterpret_cast(PyTuple_GET_ITEM(args, 0))->tensor); PADDLE_ENFORCE_EQ(self->tensor.initialized(), true, @@ -279,10 +276,10 @@ static PyObject* eager_tensor__share_buffer_to(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__is_shared_buffer_with(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor__is_shared_buffer_with(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor* dst_ptr = &(reinterpret_cast(PyTuple_GET_ITEM(args, 0))->tensor); PADDLE_ENFORCE_EQ(self->tensor.initialized(), true, @@ -303,10 +300,10 @@ static PyObject* eager_tensor__is_shared_buffer_with(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__share_underline_tensor_to(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor__share_underline_tensor_to(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor* src_ptr = &(reinterpret_cast(PyTuple_GET_ITEM(args, 0))->tensor); PADDLE_ENFORCE_EQ(self->tensor.initialized(), true, @@ -320,9 +317,10 @@ static PyObject* eager_tensor__share_underline_tensor_to(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor__is_shared_underline_tensor_with( - TensorObject* self, PyObject* args, PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor__is_shared_underline_tensor_with(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY paddle::experimental::Tensor src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, @@ -339,9 +337,9 @@ static PyObject* eager_tensor__is_shared_underline_tensor_with( EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method_detach(TensorObject* self, PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_detach(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY PADDLE_ENFORCE_EQ( self->tensor.initialized(), true, platform::errors::InvalidArgument("Tensor %s has not been initialized!", @@ -365,10 +363,10 @@ static PyObject* eager_tensor_method_detach(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_tensor_method_get_underline_tensor(TensorObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_SYNC_TRY +static PyObject* tensor_method_get_underline_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + 
EAGER_TRY if (self->tensor.is_dense_tensor()) { auto* tensor = static_cast(self->tensor.impl().get()); @@ -382,57 +380,54 @@ static PyObject* eager_tensor_method_get_underline_tensor(TensorObject* self, } // NOTE(wuweilong): Set value and not change self's original place -static PyObject* eager_tensor_method_set_value(TensorObject* self, - PyObject* args, - PyObject* kwargs) { +static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY VLOG(4) << "Value " << self->tensor.name(); pybind11::object numpy_value = pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true); - InitEagerTensorWithNumpyValue(self, numpy_value, false); + InitTensorWithNumpyValue(self, numpy_value, false); Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL } PyMethodDef variable_methods[] = { - {"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy, + {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_initialized", - (PyCFunction)(void (*)(void))eager_tensor_method__is_initialized, + (PyCFunction)(void (*)(void))tensor_method__is_initialized, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_copy_to", (PyCFunction)(void (*)(void))eager_tensor_method__copy_to, + {"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to, METH_VARARGS | METH_KEYWORDS, NULL}, - {"copy_", (PyCFunction)(void (*)(void))eager_tensor_method_copy_, + {"copy_", (PyCFunction)(void (*)(void))tensor_method_copy_, METH_VARARGS | METH_KEYWORDS, NULL}, {"reconstruct_from_", - (PyCFunction)(void (*)(void))eager_tensor_method_reconstruct_from_, + (PyCFunction)(void (*)(void))tensor_method_reconstruct_from_, METH_VARARGS | METH_KEYWORDS, NULL}, - {"retain_grads", (PyCFunction)(void (*)(void))eager_tensor_retain_grads, + {"retain_grads", (PyCFunction)(void (*)(void))tensor_retain_grads, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_clear_gradient", - (PyCFunction)(void (*)(void))eager_tensor__clear_gradient, + {"_clear_gradient", (PyCFunction)(void (*)(void))tensor__clear_gradient, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads, + {"_zero_grads", (PyCFunction)(void (*)(void))tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_share_buffer_to", - (PyCFunction)(void (*)(void))eager_tensor__share_buffer_to, + {"_share_buffer_to", (PyCFunction)(void (*)(void))tensor__share_buffer_to, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_shared_buffer_with", - (PyCFunction)(void (*)(void))eager_tensor__is_shared_buffer_with, + (PyCFunction)(void (*)(void))tensor__is_shared_buffer_with, METH_VARARGS | METH_KEYWORDS, NULL}, {"_share_underline_tensor_to", - (PyCFunction)(void (*)(void))eager_tensor__share_underline_tensor_to, + (PyCFunction)(void (*)(void))tensor__share_underline_tensor_to, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_shared_underline_tensor_with", - (PyCFunction)(void (*)(void))eager_tensor__is_shared_underline_tensor_with, + (PyCFunction)(void (*)(void))tensor__is_shared_underline_tensor_with, METH_VARARGS | METH_KEYWORDS, NULL}, - {"detach", (PyCFunction)(void (*)(void))eager_tensor_method_detach, + {"detach", (PyCFunction)(void (*)(void))tensor_method_detach, METH_VARARGS | METH_KEYWORDS, NULL}, {"get_tensor", - (PyCFunction)(void (*)(void))eager_tensor_method_get_underline_tensor, + (PyCFunction)(void (*)(void))tensor_method_get_underline_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_set_value", (PyCFunction)(void 
(*)(void))eager_tensor_method_set_value, + {"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index cd3617287d326fd45eb1386096d72d750a021e8f..8fea463baae5276d0c80a24057466b72ff32731b 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -79,10 +79,10 @@ const char* CAST_VAR_LIST_TEMPLATE = R"( auto %s = GetTensorListFromArgs("%s", "%s", args, %d, %s);)"; const char* CAST_VAR_PTR_TEMPLATE = R"( - auto %s = GetEagerTensorPtrFromArgs("%s", "%s", args, %d, %s);)"; + auto %s = GetTensorPtrFromArgs("%s", "%s", args, %d, %s);)"; const char* CAST_VAR_PTR_LIST_TEMPLATE = R"( - auto %s = GetEagerTensorPtrListFromArgs("%s", "%s", args, %d, %s);)"; + auto %s = GetTensorPtrListFromArgs("%s", "%s", args, %d, %s);)"; const char* CAST_SIZE_T_TEMPLATE = R"( auto %s = GetUnsignedLongFromArgs("%s", "%s", args, %d, %s);)"; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 942df3f69dac04fc91c524c0a3bb85bdad552dd0..fb1dc4d26b5ff8dbc88754984ab643e0b194b941 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -35,14 +35,14 @@ namespace pybind { extern PyTypeObject* p_tensor_type; -PyObject* eager_tensor_properties_get_name(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_name(TensorObject* self, void* closure) { + EAGER_TRY return ToPyObject(self->tensor.name()); EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_type(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { + EAGER_TRY if (self->tensor.is_dense_tensor()) { return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); } else { @@ -52,24 +52,24 @@ PyObject* eager_tensor_properties_get_type(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -int eager_tensor_properties_set_name(TensorObject* self, PyObject* value, - void* closure) { - EAGER_SYNC_TRY +int tensor_properties_set_name(TensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY self->tensor.set_name(CastPyArg2AttrString(value, 0)); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } -PyObject* eager_tensor_properties_get_stop_gradient(TensorObject* self, - void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_stop_gradient(TensorObject* self, + void* closure) { + EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); return ToPyObject(meta->StopGradient()); EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_grad(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { + EAGER_TRY if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { std::shared_ptr grad_node = egr::EagerUtils::grad_node(self->tensor); @@ -94,9 +94,9 @@ PyObject* eager_tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -int eager_tensor_properties_set_grad(TensorObject* self, PyObject* value, - void* closure) { - EAGER_SYNC_TRY +int tensor_properties_set_grad(TensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY auto src = CastPyArg2Tensor(value, 0); PADDLE_ENFORCE( egr::egr_utils_api::IsLeafTensor(self->tensor), @@ 
-115,34 +115,33 @@ int eager_tensor_properties_set_grad(TensorObject* self, PyObject* value, EAGER_CATCH_AND_THROW_RETURN_ZERO } -int eager_tensor_properties_set_stop_gradient(TensorObject* self, - PyObject* value, void* closure) { - EAGER_SYNC_TRY +int tensor_properties_set_stop_gradient(TensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); meta->SetStopGradient(CastPyArg2AttrBoolean(value, 0)); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } -PyObject* eager_tensor_properties_get_persistable(TensorObject* self, - void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_persistable(TensorObject* self, void* closure) { + EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); return ToPyObject(meta->Persistable()); EAGER_CATCH_AND_THROW_RETURN_NULL } -int eager_tensor_properties_set_persistable(TensorObject* self, PyObject* value, - void* closure) { - EAGER_SYNC_TRY +int tensor_properties_set_persistable(TensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); meta->SetPersistable(CastPyArg2AttrBoolean(value, 0)); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } -PyObject* eager_tensor_properties_get_shape(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { + EAGER_TRY auto ddim = self->tensor.shape(); std::vector value; size_t rank = static_cast(ddim.size()); @@ -155,50 +154,45 @@ PyObject* eager_tensor_properties_get_shape(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_place(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_place(TensorObject* self, void* closure) { + EAGER_TRY return ToPyObject(self->tensor.inner_place()); EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_place_str(TensorObject* self, - void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_place_str(TensorObject* self, void* closure) { + EAGER_TRY std::stringstream ostr; ostr << self->tensor.inner_place(); return ToPyObject(ostr.str()); EAGER_CATCH_AND_THROW_RETURN_NULL } -PyObject* eager_tensor_properties_get_dtype(TensorObject* self, void* closure) { - EAGER_SYNC_TRY +PyObject* tensor_properties_get_dtype(TensorObject* self, void* closure) { + EAGER_TRY return ToPyObject( paddle::framework::TransToProtoVarType(self->tensor.type())); EAGER_CATCH_AND_THROW_RETURN_NULL } struct PyGetSetDef variable_properties[] = { - {"grad", (getter)eager_tensor_properties_get_grad, - (setter)eager_tensor_properties_set_grad, nullptr, nullptr}, - {"name", (getter)eager_tensor_properties_get_name, - (setter)eager_tensor_properties_set_name, nullptr, nullptr}, - {"stop_gradient", (getter)eager_tensor_properties_get_stop_gradient, - (setter)eager_tensor_properties_set_stop_gradient, nullptr, nullptr}, - {"persistable", (getter)eager_tensor_properties_get_persistable, - (setter)eager_tensor_properties_set_persistable, nullptr, nullptr}, - {"shape", (getter)eager_tensor_properties_get_shape, nullptr, nullptr, - nullptr}, - // {"is_leaf", (getter)eager_tensor_properties_get_is_leaf, nullptr, + {"grad", (getter)tensor_properties_get_grad, + (setter)tensor_properties_set_grad, nullptr, nullptr}, + {"name", (getter)tensor_properties_get_name, + (setter)tensor_properties_set_name, nullptr, nullptr}, + {"stop_gradient", (getter)tensor_properties_get_stop_gradient, + 
(setter)tensor_properties_set_stop_gradient, nullptr, nullptr}, + {"persistable", (getter)tensor_properties_get_persistable, + (setter)tensor_properties_set_persistable, nullptr, nullptr}, + {"shape", (getter)tensor_properties_get_shape, nullptr, nullptr, nullptr}, + // {"is_leaf", (getter)tensor_properties_get_is_leaf, nullptr, // nullptr, // nullptr}, - {"place", (getter)eager_tensor_properties_get_place, nullptr, nullptr, - nullptr}, - {"_place_str", (getter)eager_tensor_properties_get_place_str, nullptr, - nullptr, nullptr}, - {"dtype", (getter)eager_tensor_properties_get_dtype, nullptr, nullptr, - nullptr}, - {"type", (getter)eager_tensor_properties_get_type, nullptr, nullptr, + {"place", (getter)tensor_properties_get_place, nullptr, nullptr, nullptr}, + {"_place_str", (getter)tensor_properties_get_place_str, nullptr, nullptr, nullptr}, + {"dtype", (getter)tensor_properties_get_dtype, nullptr, nullptr, nullptr}, + {"type", (getter)tensor_properties_get_type, nullptr, nullptr, nullptr}, {nullptr, nullptr, nullptr, nullptr, nullptr}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 85a39710564bc8c1b56a76035f7b2c56628ecf95..dd882ab6d970aa0572e69706ee3e90b539bf7951 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -179,7 +179,7 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerTensor, but got %s", + "EagerVariable, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -309,7 +309,7 @@ framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerTensor, but got %s", + "EagerVariable, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -597,6 +597,7 @@ std::vector GetTensorListFromArgs( if (PyList_Check(list)) { Py_ssize_t len = PyList_Size(list); + result.reserve(static_cast(len)); if (len == 0) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be list of Tensors, but got " @@ -609,6 +610,7 @@ std::vector GetTensorListFromArgs( } } else if (PyTuple_Check(list)) { Py_ssize_t len = PyTuple_Size(list); + result.reserve(static_cast(len)); if (len == 0) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be list of Tensors, but got " @@ -632,9 +634,11 @@ std::vector GetTensorListFromArgs( return result; } -paddle::experimental::Tensor* GetEagerTensorPtrFromArgs( - const std::string& op_type, const std::string& arg_name, PyObject* args, - ssize_t arg_idx, bool dispensable) { +paddle::experimental::Tensor* GetTensorPtrFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, + ssize_t arg_idx, + bool dispensable) { PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); if (PyTuple_Check(obj)) { @@ -654,7 +658,7 @@ paddle::experimental::Tensor* GetEagerTensorPtrFromArgs( return &(reinterpret_cast(obj)->tensor); } -std::vector GetEagerTensorPtrListFromArgs( +std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable) { PyObject* list = PyTuple_GET_ITEM(args, arg_idx); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 
ead9f474f675b8e1f5b6949ff59a8f185839cb43..f2429768fa998bef97ca772004fa4b30d76d026d 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -65,15 +65,15 @@ PyObject* ToPyObject( const std::unordered_map>& value); template -struct TupleEagerTensorResult { +struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { - TupleEagerTensorResult::Run(out, result); + TupleTensorResult::Run(out, result); PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } }; template -struct TupleEagerTensorResult { +struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); } @@ -84,7 +84,7 @@ PyObject* ToPyObject(const std::tuple& out) { auto len = sizeof...(Args); PyObject* result = PyTuple_New(len); - TupleEagerTensorResult::Run(out, result); + TupleTensorResult::Run(out, result); return result; } @@ -97,10 +97,12 @@ std::vector GetTensorListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); -paddle::experimental::Tensor* GetEagerTensorPtrFromArgs( - const std::string& op_type, const std::string& arg_name, PyObject* args, - ssize_t arg_idx, bool dispensable = false); -std::vector GetEagerTensorPtrListFromArgs( +paddle::experimental::Tensor* GetTensorPtrFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, + ssize_t arg_idx, + bool dispensable = false); +std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h index 7e44841e670939ef00d010c0c1fadaccd501f6ca..cf82f464a11f292b8ba09dc4cdba4eb3db6e1d96 100644 --- a/paddle/fluid/pybind/exception.h +++ b/paddle/fluid/pybind/exception.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "pybind11/pybind11.h" #define EAGER_TRY try { -#define EAGER_SYNC_TRY try { #define EAGER_CATCH_AND_THROW_RETURN_NULL \ } \ catch (...) 
{ \ diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index c84a71d8aaa002b8d40ff2713252d2cd6afff2bb..f4ed1ee3424f229d77c293d19edca911aea31f69 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -136,10 +136,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance(place_obj)) { + return place_obj.cast(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace")); + "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace/" + "CustomPlace")); } } @@ -183,6 +186,9 @@ static void InitVarBaseAndTensor( SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_mlu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); + } else if (platform::is_custom_place(place)) { + SetTensorFromPyArray(tensor, array, place, + zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " @@ -941,6 +947,10 @@ void BindImperative(py::module *m_ptr) { py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", py::arg("stop_gradient") = -1) + .def("__init__", &InitVarBaseFromNumpyWithArg, + py::arg("value"), py::arg("place"), py::arg("persistable") = false, + py::arg("zero_copy") = false, py::arg("name") = "", + py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"), py::arg("name") = "") @@ -956,6 +966,8 @@ void BindImperative(py::module *m_ptr) { py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromTensorWithArg, py::arg("tensor"), py::arg("place"), py::arg("name") = "") + .def("__init__", &InitVarBaseFromTensorWithArg, + py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromNumpyWithKwargs) .def( "__setitem_varbase__", @@ -2258,6 +2270,11 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); VLOG(4) << "Tracer(" << &self << ")" << " set expected place " << *p; + } else if (py::isinstance(obj)) { + auto p = obj.cast(); + self.SetExpectedPlace(*p); + VLOG(4) << "Tracer(" << &self << ")" + << " set expected place " << *p; } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -2301,6 +2318,21 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetMutableAllowOps()), *(imperative::AmpOperators::Instance().GetMutableBlockOps())); }) + .def("trace", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::CustomPlace &place, + bool trace_backward, + const std::map &inplace_map = {}) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp( + type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward, inplace_map); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 
f63c3111bdb3fa6e4d8060f9df0def21b3ba41b2..2b07a439d33b4a96a10a893a95e0dd26f83dd8c7 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -84,6 +84,9 @@ std::map> op_ins_map = { {"Q", "K", "V", "Offset", "Columns", "KeyPaddingMask", "AttnMask"}}, {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}}, {"graph_khop_sampler", {"Row", "Eids", "Col_Ptr", "X"}}, + {"nce", + {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", + "CustomDistAlias", "CustomDistAliasProbs"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 959e34afe3da66987f040c81b21b410d66c7a555..5289b862dc948baacf7c373ebcee483dc589d9a6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -69,6 +69,7 @@ limitations under the License. */ #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/enforce.h" @@ -1667,6 +1668,139 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif + m.def("get_all_device_type", []() { + std::vector device_types; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + device_types = platform::DeviceManager::GetAllDeviceTypes(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_all_device_type because you have installed" + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_all_device_type, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return device_types; + }); + m.def("get_all_custom_device_type", []() { + std::vector device_types; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_all_custom_device_type because you have installed" + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_all_custom_device_type, please try to " + "install CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return device_types; + }); + m.def("get_available_device", [] { + std::vector devices; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + devices = platform::DeviceManager::GetAllDeviceList(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_available_device because you have installed" + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_available_device, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return devices; + }); + m.def("get_available_custom_device", [] { + std::vector devices; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + devices = platform::DeviceManager::GetAllCustomDeviceList(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_available_custom_device because you have " + "installed" + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_available_custom_device, please try to " + "install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return devices; + }); + py::class_(m, "CustomPlace", + R"DOC( + CustomPlace is a descriptor of a device. 
+ It represents a custom device on which a tensor will be allocated and a model will run. + + Examples: + .. code-block:: python + + import paddle + fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) + )DOC") + .def("__init__", + [](platform::CustomPlace &self, const std::string &device_type, + int dev_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), device id must be 0 " + "or " + "positive integer", + device_type, dev_id); + std::exit(-1); + } + + if (LIKELY(platform::DeviceManager::HasDeviceType(device_type) && + platform::DeviceManager::IsCustom(device_type))) { + int dev_count = static_cast( + platform::DeviceManager::GetDeviceCount(device_type)); + if (UNLIKELY(dev_id >= dev_count)) { + if (dev_count == 0) { + LOG(ERROR) << "Cannot use " << device_type + << " because there is no " << device_type + << " detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), dev_id must " + "inside " + "[0, %d), because %s " + "number on your machine is %d", + device_type, dev_id, dev_count, device_type, dev_count); + std::exit(-1); + } + } + new (&self) platform::CustomPlace(device_type, dev_id); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), the device type is " + "not registered " + "as a custom device.", + device_type, dev_id); + std::exit(-1); + } +#else + LOG(ERROR) << string::Sprintf( + "Cannot use CustomDevice because you have installed CPU/GPU" + "version PaddlePaddle.\n" + "If you want to use CustomDevice, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n" + "If you only have CPU, please change " + "CustomPlace(%s, %d) to be CPUPlace().\n", + device_type, dev_id); + std::exit(-1); +#endif + }) + .def("get_device_id", + [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) + .def("get_device_type", + [](const platform::CustomPlace &self) { + return self.GetDeviceType(); + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); py::class_ cudaplace(m, "CUDAPlace", R"DOC( CUDAPlace is a descriptor of a device. @@ -2118,11 +2252,16 @@ All parameter, weight, gradient are variables in Paddle. }) .def("is_mlu_place", [](platform::Place &self) { return platform::is_mlu_place(self); }) + .def( + "is_custom_place", + [](platform::Place &self) { return platform::is_custom_place(self); }) .def("gpu_device_id", [](platform::Place &self) { return self.device; }) .def("xpu_device_id", [](platform::Place &self) { return self.device; }) .def("npu_device_id", [](platform::Place &self) { return self.device; }) .def("ipu_device_id", [](platform::Place &self) { return self.device; }) .def("mlu_device_id", [](platform::Place &self) { return self.device; }) + .def("custom_device_id", + [](platform::Place &self) { return self.device; }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", @@ -2154,6 +2293,10 @@ All parameter, weight, gradient are variables in Paddle. 
[](platform::Place &self, const platform::MLUPlace &mlu_place) { self = mlu_place; }) + .def("set_place", + [](platform::Place &self, const platform::CustomPlace &plug_place) { + self = plug_place; + }) .def("__repr__", string::to_string) .def("__str__", string::to_string); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 9a11c5946f318b7e861b853d301e103e641d2722..f1983175bdf94fa6e9fcee49e6f85e7bdf6f4765 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -247,6 +248,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); +#endif + } else if (platform::is_custom_place(self.place())) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + const T *a = self.data(); + auto p = self.place(); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), + nullptr); #endif } VLOG(10) << "TensorGetElement, place: " << self.place() @@ -289,6 +297,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); +#endif + } else if (platform::is_custom_place(self->place())) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + auto p = self->place(); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), + nullptr); #endif } } @@ -368,6 +383,24 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use MLUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (paddle::platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::Place tmp_place = place; + platform::DeviceGuard guard(tmp_place); + auto dst = self->mutable_data(place); + + platform::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D( + reinterpret_cast(dst), + const_cast(reinterpret_cast(array.data())), + array.nbytes()); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); + ctx.Wait(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CustomDevice in CPU/GPU/XPU version. 
" + "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -757,6 +790,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); bool is_npu_tensor = platform::is_npu_place(tensor.place()); bool is_mlu_tensor = platform::is_mlu_place(tensor.place()); + bool is_custom_device_tensor = platform::is_custom_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype()); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -776,7 +810,8 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr( framework::TransToProtoVarType(tensor.dtype())); - if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor && + !is_custom_device_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, @@ -900,6 +935,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use MLUPlace in CPU/GPU/XPU/NPU version, " "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (is_custom_device_tensor) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), tensor.place(), + tensor_buf_ptr, copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CustomPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with CustomPlace " + "support.")); #endif } PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 62e54e26eda990e32a62c4ba99070b78d5c6275d..c8253effe8488946dfaa3c3bd4812c73d7f938d8 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -107,6 +107,6 @@ endif() cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto infrt_naive) cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto) -add_dependencies(infrt ${infrt_mlir_incs}) +add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index ce38c53617c711e25bf559f8aa668e5da253955d..757d47a8de43e2a394ad5296e617ed6ed94078f3 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -31,9 +31,9 @@ target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) 
target_link_libraries(print-ir infrt ${mlir_libs}) add_dependencies(print-ir pd_ops_inc) - cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) +add_subdirectory(infrt) add_subdirectory(tensorrt) if (INFRT_WITH_PTEN) diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..98910d8d0ecf0b99bd1eb8b860ed573ae88ef203 --- /dev/null +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_dialect.cc + ) + +add_mlir_dialect(infrt_ops Infrt) diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/infrt_dialect.cc new file mode 100644 index 0000000000000000000000000000000000000000..388de858b6572ea5900851b170d09589387c0b05 --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_dialect.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" + +#include +#include +#include +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/infrt_opsDialect.cpp.inc" + +#define GET_TYPEDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" + +namespace infrt { + +void InfrtDialect::initialize() { + addTypes< +#define GET_TYPEDEF_LIST +#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" // NOLINT + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" // NOLINT + >(); +} + +/// Parse a type registered to this dialect. +mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return nullptr; + // parse TensorType, for example: !infrt.lod_tensor<3x64x3x3xf32,5> + // 5 is the lod_level + if (keyword == "lod_tensor") { + // Parse the size and elementType. + llvm::SmallVector shape; + mlir::Type elementType; + int32_t lod_level = 0; + // parse "<" + if (parser.parseLess()) return nullptr; + + if (parser.parseDimensionList(shape)) return nullptr; + + // Parse the element type. 
+ if (parser.parseType(elementType)) return nullptr; + // parse "," + if (parser.parseComma()) return nullptr; + + // llvm::APInt lod_level; + if (parser.parseInteger(lod_level)) return nullptr; + + // parse ">" + if (parser.parseGreater()) return nullptr; + + return LoDTensorType::get( + parser.getContext(), shape, elementType, lod_level); + } + // Todo: parse other type + return mlir::Type(); +} + +void InfrtDialect::printType(::mlir::Type type, + ::mlir::DialectAsmPrinter &os) const { + // print TensorType, for example: !infrt.tensor + if (type.isa()) { + auto lodTensorType = type.cast(); + os << "lod_tensor<"; + auto shape = lodTensorType.getShape(); + for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) + os << *dim << 'x'; + os << shape.back() << 'x' << lodTensorType.getElementType() << ", " + << lodTensorType.getLod_level() << ">"; + return; + } + llvm_unreachable("unknown infrt type."); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/infrt_dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..21a1f6b34f6a5f33bd82c4e78669ee24221a08f1 --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_dialect.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +//===----------------------------------------------------------------------===// +// Dialect +//===----------------------------------------------------------------------===// +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc" +#define GET_TYPEDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_ops.h.inc" diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/infrt_ops.td new file mode 100644 index 0000000000000000000000000000000000000000..319760973cd90c667793e29761c030141990c242 --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_ops.td @@ -0,0 +1,52 @@ +#ifndef Infrt_OpS +#define Infrt_OpS + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def Infrt_Dialect : Dialect { + let summary = + "A dialect containing the Infrt Attributes, Operations, and Types"; + + let name = "Infrt"; + let cppNamespace = "::infrt"; +} + +// Type definitions + +// Base class for Infrt dialect types. +class Infrt_Type traits = [], + string baseCppClass = "::mlir::Type"> + : TypeDef { +} + +def LoDTensor : Infrt_Type<"LoDTensor"> { + let summary = "infrt lod tensor"; + let description = [{lod_tensor<3x64x3x3xf32, 3>}]; + let parameters = (ins + ArrayRefParameter<"int64_t">:$shape, + "mlir::Type":$elementType, + "int32_t":$lod_level + ); +} + +// Op definition +class Infrt_Op traits = []> : Op { + + // Each registered op needs to provide all of a printer, parser and verifier. 
+ // let printer = [{ return infrt::print(p, *this); }]; + // let verifier = [{ return infrt::verify(*this); }]; + // let parser = [{ return infrt::parse$cppClass(parser, result); }]; +} + +// def InfRT_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> { +// let summary = "kernel op"; +// let description = [{ +// kernel op! +// }]; +// let arguments = (ins StrAttr:$name, PD_Tensor:$X, PD_Tensor:$Y, DefaultValuedAttr:$Alpha, DefaultValuedAttr:$Beta); +// +// let results = (outs PD_Tensor:$Out); +// } + +#endif // Infrt_OpS diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 9afefc0158715bcd17f26447631d69441b445c13..090f1aea289109feda54b12131daf2993ea4e5e0 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -18,6 +18,7 @@ #include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/pten/infrt_pten_tensor.h" @@ -28,6 +29,7 @@ namespace infrt { void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT registry.insert, "pd.dtype">; -def PD_Tensor : TensorOf<[PD_ElementType]>; +// def PD_Tensor : TensorOf<[PD_ElementType]>; +def PD_Tensor1 : TensorOf<[PD_ElementType]>; + +def PD_Tensor : AnyTypeOf<[PD_Tensor1, LoDTensor],"pd.ttype">; def PD_Tensor_Array : VectorOf<[PD_Tensor]>; diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index f3b85ae4b5d0b9c93dfc4d0a1d9530c1e18da925..7cf5b2fb20f527eefe31f817c7fe85c7864c8669 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -16,6 +16,7 @@ #include #include +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #define GET_OP_CLASSES diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir index ca61ddc0b7053dce34b115dc443e872206960631..02511b21e4792bb37c416093a7c272090eae44c1 100644 --- a/paddle/infrt/tests/dialect/paddle_ops.mlir +++ b/paddle/infrt/tests/dialect/paddle_ops.mlir @@ -3,7 +3,7 @@ func @ops() { %a = pd.feed() {name="input0"} : tensor %b = pd.feed() {name="input1"}: tensor - + %d = pd.feed() {name="input3"}: !Infrt.lod_tensor<3x4x9xf32, 0> %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor infrt.return diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index 900de42bbac9577f25f625d4643ef7734ece9f12..1872fcc0da4d72a569083f967ed94320606ed64c 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -222,6 +222,14 @@ class PADDLE_API Tensor final { */ bool is_dense_tensor() const; + /** + * @brief Determine whether tensor is SelectedRows + * + * @return true + * @return false + */ + bool is_selected_rows() const; + /* Part 3: Device and Backend methods */ /** diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 6fb0d2706ca90267cc2e06a06ba9b570f275da2c..40f35896323b98543364428c99b20d03571dbbd7 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/pten/core/tensor_base.h" #include "paddle/pten/core/tensor_meta.h" #include "paddle/pten/core/tensor_utils.h" - /** * [ Why still include the fluid headers? 
] * @@ -133,7 +132,9 @@ DataLayout Tensor::layout() const { return impl_->layout(); } bool Tensor::is_dense_tensor() const { return pten::DenseTensor::classof(impl_.get()); } - +bool Tensor::is_selected_rows() const { + return pten::SelectedRows::classof(impl_.get()); +} /* Part 3: Device and Backend methods */ PlaceType Tensor::place() const { diff --git a/paddle/pten/common/place.cc b/paddle/pten/common/place.cc index e2cb934f0a1c5d5fb599bddcf44345f70ac688c2..0a3bfccb16a4b2aa83425ddc41ae141251842bac 100644 --- a/paddle/pten/common/place.cc +++ b/paddle/pten/common/place.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/pten/api/ext/exception.h" @@ -50,7 +51,11 @@ const char *AllocationTypeStr(AllocationType type) { std::string Place::DebugString() const { std::ostringstream os; os << "Place("; - os << AllocationTypeStr(alloc_type_); + if (alloc_type_ == AllocationType::CUSTOM) { + os << GetGlobalDeviceType(device_type_id_); + } else { + os << AllocationTypeStr(alloc_type_); + } if (alloc_type_ == AllocationType::GPUPINNED || alloc_type_ == AllocationType::NPUPINNED || alloc_type_ == AllocationType::CPU) { @@ -66,4 +71,23 @@ std::ostream &operator<<(std::ostream &os, const Place &p) { return os; } +static std::unordered_map global_registered_device_type_id; +static std::unordered_map global_registered_device_type; + +size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) { + if (device_type.empty()) return 0; + if (global_registered_device_type_id.find(device_type) == + global_registered_device_type_id.end()) { + size_t device_type_id = global_registered_device_type_id.size() + 1; + global_registered_device_type_id[device_type] = device_type_id; + global_registered_device_type[device_type_id] = device_type; + } + return global_registered_device_type_id[device_type]; +} + +std::string GetGlobalDeviceType(size_t device_type_id) { + if (device_type_id == 0) return ""; + return global_registered_device_type[device_type_id]; +} + } // namespace pten diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h index 75f1f4de9984c72200df68f1d55cf45ce7a58c98..6b7d1ea55d5c4159bd2d005518dd3631db7c05a7 100644 --- a/paddle/pten/common/place.h +++ b/paddle/pten/common/place.h @@ -28,29 +28,49 @@ enum class AllocationType : int8_t { NPUPINNED = 6, IPU = 7, MLU = 8, + CUSTOM = 9, }; const char* AllocationTypeStr(AllocationType type); +size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); +std::string GetGlobalDeviceType(size_t device_type_id_); + /// \brief The place is used to specify where the data is stored. 
class Place { public: Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {} - explicit Place(AllocationType type, int8_t id) - : device(id), alloc_type_(type) {} - - explicit Place(AllocationType type) : device(0), alloc_type_(type) {} - - void Reset(AllocationType type, int8_t device_id = 0) noexcept { + explicit Place(AllocationType type, + int8_t id, + const std::string& dev_type = "") + : device(id), + alloc_type_(type), + device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + + explicit Place(AllocationType type, const std::string& dev_type = "") + : device(0), + alloc_type_(type), + device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + + void Reset(AllocationType type, + int8_t device_id = 0, + const std::string& dev_type = "") noexcept { alloc_type_ = type; device = device_id; + if (!dev_type.empty()) { + device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type); + } } AllocationType GetType() const { return alloc_type_; } int8_t GetDeviceId() const { return device; } + std::string GetDeviceType() const { + return GetGlobalDeviceType(device_type_id_); + } + std::string DebugString() const; inline bool operator==(const Place& rhs) const { @@ -62,6 +82,10 @@ class Place { alloc_type_ == AllocationType::NPUPINNED) { return true; } + if (alloc_type_ == AllocationType::CUSTOM) { + return device_type_id_ == rhs.device_type_id_ && + device == rhs.GetDeviceId(); + } return device == rhs.GetDeviceId(); } inline bool operator!=(const Place& rhs) const { return !(*this == rhs); } @@ -69,6 +93,10 @@ class Place { if (alloc_type_ != rhs.GetType()) { return static_cast(alloc_type_) < static_cast(rhs.GetType()); } + if (alloc_type_ == AllocationType::CUSTOM && + device_type_id_ != rhs.device_type_id_) { + return device_type_id_ < rhs.device_type_id_; + } return device < rhs.GetDeviceId(); } @@ -79,6 +107,7 @@ class Place { private: AllocationType alloc_type_{AllocationType::UNDEFINED}; + size_t device_type_id_; }; class CPUPlace : public Place { @@ -157,6 +186,22 @@ class MLUPlace : public Place { : Place(AllocationType::MLU, place.GetDeviceId()) {} }; +class CustomPlace : public Place { + public: + explicit CustomPlace(const std::string dev_type) + : Place(AllocationType::CUSTOM, 0, dev_type) {} + CustomPlace(const std::string dev_type, int device_id) + : Place(AllocationType::CUSTOM, device_id, dev_type) {} + + CustomPlace(const CustomPlace&) = default; + CustomPlace(const Place& place) { // NOLINT + if (place.GetType() == AllocationType::CUSTOM) { + this->Reset( + AllocationType::CUSTOM, place.GetDeviceId(), place.GetDeviceType()); + } + } +}; + std::ostream& operator<<(std::ostream&, const Place&); } // namespace pten diff --git a/paddle/pten/core/compat/type_defs.h b/paddle/pten/core/compat/type_defs.h index eb5459b1b6ea723d7118a2a05addc1988987efcc..c9d7d5bb54b620ceeac55de21a28e2440a15186b 100644 --- a/paddle/pten/core/compat/type_defs.h +++ b/paddle/pten/core/compat/type_defs.h @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include namespace egr { -class EagerTensor; +class EagerVariable; } namespace paddle { namespace framework { @@ -76,9 +76,9 @@ struct NameVarMapTrait { }; template <> -struct NameVarMapTrait { +struct NameVarMapTrait { using Type = - std::map>>; + std::map>>; }; } // namespace details @@ -88,7 +88,7 @@ using NameVarMap = typename details::NameVarMapTrait::Type; using NameVarBaseMap = NameVarMap; using NameVariableWrapperMap = NameVarMap; -using NameTensorMap = NameVarMap; +using NameTensorMap = NameVarMap; using VariableWrapperList = std::vector>; diff --git a/paddle/pten/core/infermeta_utils.h b/paddle/pten/core/infermeta_utils.h index 6de91db9382e22537e577ce3188764034c7235e3..59d2a4ed3c089d2480bfcbe526d2706371e322bc 100644 --- a/paddle/pten/core/infermeta_utils.h +++ b/paddle/pten/core/infermeta_utils.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include +#include +#include #include #include "paddle/pten/common/scalar.h" @@ -55,9 +57,12 @@ class InferMetaContext { AttrType AttrAt(size_t idx) { try { return paddle::any_cast(attrs_.at(idx)); - } catch (paddle::bad_any_cast&) { + } catch (paddle::bad_any_cast& e) { PADDLE_THROW(pten::errors::InvalidArgument( - "Attribute cast error in InferMeta Context.")); + "Attribute cast error in InferMeta Context, the expected attribute " + "type is `%s`, but actual attribute type is `%s`.", + std::type_index(typeid(AttrType)).name(), + std::type_index(attrs_.at(idx).type()).name())); } } @@ -151,10 +156,15 @@ struct InferMetaFnImpl { PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int64_t); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(float); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(double); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::string&); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE( const std::vector&); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); + PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE( + const std::vector&); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataType); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(Backend); PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataLayout); diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h index 4a05d7ed3153f1e20926bb95eaac5d2c3b5ca5db..8250179b7a28b25a673f84c235c6d0c3eeb3043c 100644 --- a/paddle/pten/core/selected_rows.h +++ b/paddle/pten/core/selected_rows.h @@ -29,10 +29,6 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/mixed_vector.h" - -namespace egr { -class EagerTensor; -} // namespace egr namespace pten { class SelectedRows : public TensorBase, public TypeInfoTraits { @@ -199,39 +195,6 @@ class SelectedRows : public TensorBase, std::unique_ptr value_{nullptr}; int64_t height_; // height indicates the underline tensor's height std::unique_ptr rwlock_{nullptr}; - // TODO(jiabin): Remove this when we don't need EagerTensor support - // SelectedRows which is expected in next version. - /** Why we need this weird friend class? - * In eager mode, since some of ops doesn't support C++ API for now we need to - *use 'imperative::TraceOp' to run it. 
- * So, we need to support get a SelectedRows from egr::EagerTensor's - *framework::Variable obj and used it to reconstruct - * a new paddle::experimental::Tensor to support framework usage. However, we - *got 2 problems here. - * First, we got 2 unique_ptr in SelectedRows so that we can't support - *std::make_shared in EagerTensor's SetImplWithSelectedRows method, - * since we have to construct a shared_ptr for paddle::experimental::Tensor's - *impl. - * Second, when we are trying to support move constructor for SelectedRows we - *found that we can't get its rvalue from - * framework::Variable because it holds an obj of target type. - * - * - * The only three way to solve this problem is: - * 1. Just like what we have done, using friend class and just copy/move each - *member. In this way, we can avoid additional API - * and symbols. - * 2. Make pten::SelectedRows's member from unique_ptr to shared_ptr. However, - *this may cause some cost of performance. - * 3. Add some api to return or move member of framework::SelectedRows. - *However, it's not as safe as first solution. - * 4. Support all framework::SelectedRows related ops and make sure - *EagerTensor never holds framework::SelectedRows. - * - * If anyone got better ideas, welcome to contact JiabinYang, we are open for - *your help. - **/ - friend class egr::EagerTensor; }; } // namespace pten diff --git a/paddle/pten/core/utils/data_type.h b/paddle/pten/core/utils/data_type.h index ee223afb3b03c0e2b770097e4313ce31c45927ea..ca0c678e0623d7b7a38b8d87170fc448798f7ea6 100644 --- a/paddle/pten/core/utils/data_type.h +++ b/paddle/pten/core/utils/data_type.h @@ -57,7 +57,7 @@ inline void VisitDataType(pten::DataType type, Visitor visitor) { _PtenForEachDataType_(PtenVisitDataTypeCallback); #undef PtenVisitDataTypeCallback PADDLE_THROW(pten::errors::Unimplemented( - "Not supported proto::VarType::Type(%d) as data type.", + "Not supported pten::DataType(%d) as data type.", static_cast(type))); } } // namespace pten diff --git a/paddle/pten/infermeta/backward.cc b/paddle/pten/infermeta/backward.cc index db92449519436024a01c9c891f9671756777a345..2f2fcc7db31ea51f2111103675bbd20e7ab1ec58 100644 --- a/paddle/pten/infermeta/backward.cc +++ b/paddle/pten/infermeta/backward.cc @@ -16,13 +16,10 @@ limitations under the License. */ namespace pten { -void MatmulGradInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& out_grad_meta, - bool transpose_x, - bool transpose_y, - MetaTensor* dx, - MetaTensor* dy) { +void GeneralBinaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* dx, + MetaTensor* dy) { if (dx) { dx->share_meta(x); } diff --git a/paddle/pten/infermeta/backward.h b/paddle/pten/infermeta/backward.h index d6b96861412861de6fb892a28c3930bd7db20da7..ded51cac6378c574232eed3e641def23c68c3db8 100644 --- a/paddle/pten/infermeta/backward.h +++ b/paddle/pten/infermeta/backward.h @@ -20,12 +20,9 @@ limitations under the License. 
*/ namespace pten { -void MatmulGradInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& out_grad_meta, - bool transpose_x, - bool transpose_y, - MetaTensor* dx, - MetaTensor* dy); +void GeneralBinaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* dx, + MetaTensor* dy); } // namespace pten diff --git a/paddle/pten/kernels/abs_grad_kernel.h b/paddle/pten/kernels/abs_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..494f29da783d4d6b6d3f6f940d3591ace578aea1 --- /dev/null +++ b/paddle/pten/kernels/abs_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/device_context.h" + +namespace pten { + +template +void AbsGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx); + +template +void AbsDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& ddx, + DenseTensor* ddout); + +} // namespace pten diff --git a/paddle/pten/kernels/abs_kernel.h b/paddle/pten/kernels/abs_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0322afadfab8d2a24a358ef7c747d09174b124f2 --- /dev/null +++ b/paddle/pten/kernels/abs_kernel.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/device_context.h" + +namespace pten { + +template +void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); + +} // namespace pten diff --git a/paddle/pten/kernels/cpu/abs_grad_kernel.cc b/paddle/pten/kernels/cpu/abs_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9d6675aa7b3155d1f83f4bd098a867e5f3359938 --- /dev/null +++ b/paddle/pten/kernels/cpu/abs_grad_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/common/complex.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" +#include "paddle/pten/kernels/impl/abs_grad_kernel_impl.h" + +using pten::dtype::complex; + +PT_REGISTER_KERNEL(abs_grad, + CPU, + ALL_LAYOUT, + pten::AbsGradKernel, + float, + double, + int, + int64_t, + complex, + complex) {} +PT_REGISTER_KERNEL(abs_double_grad, + CPU, + ALL_LAYOUT, + pten::AbsDoubleGradKernel, + float, + double, + int, + int64_t, + complex, + complex) {} diff --git a/paddle/pten/kernels/cpu/abs_kernel.cc b/paddle/pten/kernels/cpu/abs_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee766a18d42aba35f4d9e45d6d381beda99798d6 --- /dev/null +++ b/paddle/pten/kernels/cpu/abs_kernel.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/abs_kernel.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" + +namespace pten { + +template +void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + auto numel = x.numel(); + auto* x_data = x.data(); + ctx.template Alloc>( + out, size_t(x.numel() * sizeof(pten::funcs::Real))); + auto* out_data = out->data>(); + + paddle::platform::ForRange for_range(ctx, numel); + pten::funcs::AbsFunctor functor(x_data, out_data, numel); + for_range(functor); +} + +} // namespace pten + +PT_REGISTER_KERNEL(abs, + CPU, + ALL_LAYOUT, + pten::AbsKernel, + float, + double, + int, + int64_t, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/histogram_kernel.cc b/paddle/pten/kernels/cpu/histogram_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..700b7e092919aa8d922b0ebfbe8388eb646aac5b --- /dev/null +++ b/paddle/pten/kernels/cpu/histogram_kernel.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/histogram_kernel.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/math_function.h" + +namespace pten { + +template +void HistogramKernel(const Context& dev_ctx, + const DenseTensor& input, + int64_t bins, + int min, + int max, + DenseTensor* output) { + auto& nbins = bins; + auto& minval = min; + auto& maxval = max; + + const T* input_data = input.data(); + auto input_numel = input.numel(); + + int64_t* out_data = output->mutable_data(dev_ctx.GetPlace()); + pten::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + + if (input_data == nullptr) return; + + T output_min = static_cast(minval); + T output_max = static_cast(maxval); + if (output_min == output_max) { + output_min = *std::min_element(input_data, input_data + input_numel); + output_max = *std::max_element(input_data, input_data + input_numel); + } + if (output_min == output_max) { + output_min = output_min - 1; + output_max = output_max + 1; + } + + PADDLE_ENFORCE_EQ( + (std::isinf(static_cast(output_min)) || + std::isnan(static_cast(output_max)) || + std::isinf(static_cast(output_min)) || + std::isnan(static_cast(output_max))), + false, + pten::errors::OutOfRange("range of min, max is not finite")); + PADDLE_ENFORCE_GE( + output_max, + output_min, + pten::errors::InvalidArgument( + "max must be larger or equal to min. If min and max are both zero, " + "the minimum and maximum values of the data are used. " + "But received max is %d, min is %d", + maxval, + minval)); + + for (int64_t i = 0; i < input_numel; i++) { + if (input_data[i] >= output_min && input_data[i] <= output_max) { + const int64_t bin = (int64_t)((input_data[i] - output_min) * nbins / + (output_max - output_min)); + out_data[std::min(bin, nbins - 1)] += 1; + } + } +} + +} // namespace pten + +PT_REGISTER_KERNEL(histogram, + CPU, + ALL_LAYOUT, + pten::HistogramKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/lerp_grad_kernel.cc b/paddle/pten/kernels/cpu/lerp_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4aac143eb16dcad6d6d31e302357babab7ed8309 --- /dev/null +++ b/paddle/pten/kernels/cpu/lerp_grad_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/pten/kernels/lerp_grad_kernel.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/lerp_grad_kernel_impl.h" + +PT_REGISTER_KERNEL( + lerp_grad, CPU, ALL_LAYOUT, pten::LerpGradKernel, float, double) {} diff --git a/paddle/pten/kernels/cpu/lerp_kernel.cc b/paddle/pten/kernels/cpu/lerp_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9f8513065ce9c5c52f85b0f7d9e7acfade534763 --- /dev/null +++ b/paddle/pten/kernels/cpu/lerp_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/lerp_kernel.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/lerp_kernel_impl.h" + +PT_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, pten::LerpKernel, float, double) {} diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index ecb058d35b909bc9455b019e55ab8f2277fd587b..e1a1788815ebfef75ac29e332da3e76f3d2a5d52 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -94,6 +94,7 @@ PT_REGISTER_KERNEL(empty_like, int64_t, bool, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} #endif diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h index 9a96a5fd45e4c48059ba8915f2108e4f9ac2aad7..e751f85b50f24bdddb475653e5e706975333242c 100644 --- a/paddle/pten/kernels/funcs/common_shape.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -102,5 +102,30 @@ inline void GetPrePostNumel( } } +static framework::DDim ExtendDims2Rank(const framework::DDim &in_dims, + int rank) { + if (in_dims.size() == rank) { + return in_dims; + } + std::vector shapes(rank, 1); + for (int i = in_dims.size() - 1, j = rank - 1; i >= 0; --i, --j) { + shapes[j] = in_dims[i]; + } + return framework::make_ddim(shapes); +} + +template +static void GetBroadcastDims(const framework::DDim &in_dims, + const framework::DDim &out_dims, + Eigen::DSizes *bcast_dims) { + for (size_t i = 0; i < D; ++i) { + if (in_dims[i] == out_dims[i]) { + (*bcast_dims)[i] = 1; + } else { + (*bcast_dims)[i] = std::max(in_dims[i], out_dims[i]); + } + } +} + } // namespace funcs } // namespace pten diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/pten/kernels/funcs/complex_functors.h similarity index 57% rename from paddle/fluid/operators/math/complex_functors.h rename to paddle/pten/kernels/funcs/complex_functors.h index 48f16b87cbd66c6a39c74d1dbaab2349193f04ae..b0eee3ac1fdce3c9fc7de7f8162ae74f4b33daff 100644 --- a/paddle/fluid/operators/math/complex_functors.h +++ b/paddle/pten/kernels/funcs/complex_functors.h @@ -13,15 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once - +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#include #include -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/core/hostdevice.h" -namespace paddle { -namespace operators { -namespace math { +namespace pten { +namespace funcs { template struct cond { @@ -64,8 +66,8 @@ using select_t = typename select::type; template using Real = - select_t>::value, float>, - cond>::value, double>, + select_t>::value, float>, + cond>::value, double>, T>; template @@ -77,13 +79,13 @@ using NoComplex = typename std::enable_if::value>::type; template using EnableComplex = typename std::enable_if< - std::is_same>::value || - std::is_same>::value>::type; + std::is_same>::value || + std::is_same>::value>::type; template using DisableComplex = typename std::enable_if< - !std::is_same>::value && - !std::is_same>::value>::type; + !std::is_same>::value && + !std::is_same>::value>::type; template struct RealFunctor; @@ -154,8 +156,7 @@ struct AbsFunctor>> { template struct AbsGradFunctor { - AbsGradFunctor(const math::Real* dout, const T* x, T* output, - int64_t numel) + AbsGradFunctor(const Real* dout, const T* x, T* output, int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -166,52 +167,55 @@ struct AbsGradFunctor { } } - const math::Real* dout_; + const Real* dout_; const T* x_; T* output_; int64_t numel_; }; template <> -struct AbsGradFunctor> { - AbsGradFunctor(const float* dout, const paddle::platform::complex* x, - paddle::platform::complex* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const float* dout, + const pten::dtype::complex* x, + pten::dtype::complex* output, + int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex(0)) { - output_[idx] = paddle::platform::complex(0); + if (x_[idx] == pten::dtype::complex(0)) { + output_[idx] = pten::dtype::complex(0); } else { - output_[idx] = paddle::platform::complex(dout_[idx]) * - (x_[idx] / paddle::platform::complex(abs(x_[idx]))); + output_[idx] = pten::dtype::complex(dout_[idx]) * + (x_[idx] / pten::dtype::complex(abs(x_[idx]))); } } const float* dout_; - const paddle::platform::complex* x_; - paddle::platform::complex* output_; + const pten::dtype::complex* x_; + pten::dtype::complex* output_; int64_t numel_; }; template <> -struct AbsGradFunctor> { - AbsGradFunctor(const double* dout, const paddle::platform::complex* x, - paddle::platform::complex* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const double* dout, + const pten::dtype::complex* x, + pten::dtype::complex* output, + int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex(0)) { - output_[idx] = paddle::platform::complex(0); + if (x_[idx] == pten::dtype::complex(0)) { + output_[idx] = pten::dtype::complex(0); } else { - output_[idx] = - paddle::platform::complex(dout_[idx]) * - (x_[idx] / paddle::platform::complex(abs(x_[idx]))); + output_[idx] = pten::dtype::complex(dout_[idx]) * + (x_[idx] / pten::dtype::complex(abs(x_[idx]))); } } const double* dout_; - const paddle::platform::complex* x_; - paddle::platform::complex* output_; + const pten::dtype::complex* x_; + pten::dtype::complex* output_; int64_t numel_; }; @@ -235,46 +239,48 @@ struct AbsGradGradFunctor { }; template <> -struct 
AbsGradGradFunctor> { - AbsGradGradFunctor(const paddle::platform::complex* ddx, - const paddle::platform::complex* x, - paddle::platform::complex* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const pten::dtype::complex* ddx, + const pten::dtype::complex* x, + pten::dtype::complex* output, + int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex(0)) { - output_[idx] = paddle::platform::complex(0); + if (x_[idx] == pten::dtype::complex(0)) { + output_[idx] = pten::dtype::complex(0); } else { - output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / - paddle::platform::complex(abs(x_[idx])); + output_[idx] = pten::dtype::complex(ddx_[idx]) * x_[idx] / + pten::dtype::complex(abs(x_[idx])); } } - const paddle::platform::complex* ddx_; - const paddle::platform::complex* x_; - paddle::platform::complex* output_; + const pten::dtype::complex* ddx_; + const pten::dtype::complex* x_; + pten::dtype::complex* output_; int64_t numel_; }; template <> -struct AbsGradGradFunctor> { - AbsGradGradFunctor(const paddle::platform::complex* ddx, - const paddle::platform::complex* x, - paddle::platform::complex* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const pten::dtype::complex* ddx, + const pten::dtype::complex* x, + pten::dtype::complex* output, + int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex(0)) { - output_[idx] = paddle::platform::complex(0); + if (x_[idx] == pten::dtype::complex(0)) { + output_[idx] = pten::dtype::complex(0); } else { - output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / - paddle::platform::complex(abs(x_[idx])); + output_[idx] = pten::dtype::complex(ddx_[idx]) * x_[idx] / + pten::dtype::complex(abs(x_[idx])); } } - const paddle::platform::complex* ddx_; - const paddle::platform::complex* x_; - paddle::platform::complex* output_; + const pten::dtype::complex* ddx_; + const pten::dtype::complex* x_; + pten::dtype::complex* output_; int64_t numel_; }; template @@ -318,8 +324,10 @@ struct RealImagToComplexFunctor; template struct RealImagToComplexFunctor>> { - RealImagToComplexFunctor(const Real* input_real, const Real* input_imag, - T* output, int64_t numel) + RealImagToComplexFunctor(const Real* input_real, + const Real* input_imag, + T* output, + int64_t numel) : input_real_(input_real), input_imag_(input_imag), output_(output), @@ -363,6 +371,84 @@ struct ConjFunctor> { T* output_; }; -} // namespace math -} // namespace operators -} // namespace paddle +template +struct AngleFunctor; + +// angel function for complex +template +struct AngleFunctor>> { + AngleFunctor(const T* input, pten::funcs::Real* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = arg(input_[idx]); + } + + const T* input_; + pten::funcs::Real* output_; + int64_t numel_; +}; + +// angel function for real +template +struct AngleFunctor>> { + AngleFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = input_[idx] < static_cast(0) ? 
M_PI : 0; + } + + const T* input_; + T* output_; + int64_t numel_; +}; + +template +struct AngleGradFunctor; + +// angle grad for complex +template +struct AngleGradFunctor>> { + AngleGradFunctor(const pten::funcs::Real* dout, + const T* x, + T* dx, + int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + if (x_[idx] == T(0)) { + dx_[idx] = T(0); + } else { + const pten::funcs::Real r_square = + x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; + dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, + dout_[idx] * x_[idx].real / r_square); + } + } + + const pten::funcs::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; + +// angle grad for real +template +struct AngleGradFunctor>> { + AngleGradFunctor(const pten::funcs::Real* dout, + const T* x, + T* dx, + int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } + + const pten::funcs::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; + +} // namespace funcs +} // namespace pten diff --git a/paddle/fluid/operators/math/compound_functors.h b/paddle/pten/kernels/funcs/compound_functors.h similarity index 86% rename from paddle/fluid/operators/math/compound_functors.h rename to paddle/pten/kernels/funcs/compound_functors.h index 6a43215bf52a9b231a47241d1bb27695da031957..c3d14a50659396345b94a0aaaff2972b5e0fe08e 100644 --- a/paddle/fluid/operators/math/compound_functors.h +++ b/paddle/pten/kernels/funcs/compound_functors.h @@ -18,9 +18,8 @@ limitations under the License. */ #include #include -namespace paddle { -namespace operators { -namespace math { +namespace pten { +namespace funcs { // Z = BinaryFunctor(X, UnaryFunctor(Y)) template @@ -69,8 +68,8 @@ struct BinaryCompoundGradDxFunctor { return dout * d_binary_fun_.Dx(x, unary_fun_(y)); } - inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, - T dout) { + inline HOSTDEVICE T + UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { return dout * d_binary_fun_.Dx(x, intermediate_out); } @@ -82,8 +81,11 @@ struct BinaryCompoundGradDxFunctor { }; // Z = BinaryFunctor(X, UnaryFunctor(Y)) -template +template struct BinaryCompoundGradDyFunctor { BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun, const UnaryFun &unary_fun, @@ -96,8 +98,8 @@ struct BinaryCompoundGradDyFunctor { return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_.UseX(y); } - inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, - T dout) { + inline HOSTDEVICE T + UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { if (InPlace) { return dout * d_binary_fun_.Dy(x, intermediate_out) * d_unary_fun_.UseOut(intermediate_out); @@ -116,8 +118,11 @@ struct BinaryCompoundGradDyFunctor { }; // Z = UnaryFunctor(BinaryFunctor(X, Y)) -template +template struct UnaryCompoundGradDxFunctor { UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun, const BinaryFun &binary_fun, @@ -136,8 +141,8 @@ struct UnaryCompoundGradDxFunctor { return base * d_binary_fun_.Dx(x, y); } - inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, - T dout) { + inline HOSTDEVICE T + UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { T base; if (InPlace) { base = dout * d_unary_fun_.UseOut(out); @@ -156,8 +161,11 @@ struct UnaryCompoundGradDxFunctor { }; // Z = UnaryFunctor(BinaryFunctor(X, Y)) -template +template struct UnaryCompoundGradDyFunctor { UnaryCompoundGradDyFunctor(const 
DUnaryFun &d_unary_fun, const BinaryFun &binary_fun, @@ -176,8 +184,8 @@ struct UnaryCompoundGradDyFunctor { return base * d_binary_fun_.Dy(x, y); } - inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, - T dout) { + inline HOSTDEVICE T + UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { T base; if (InPlace) { base = dout * d_unary_fun_.UseOut(out); @@ -206,7 +214,9 @@ struct BinaryCompoundGradDIntermedaiteOutFunctor { return dout * d_binary_fun_.Dy(x, unary_fun_(y)); } - inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out, + inline HOSTDEVICE T UseIntermediateOut(T x, + T intermediate_out, + T out, T dout) { return dout * d_binary_fun_.Dy(x, intermediate_out); } @@ -233,7 +243,9 @@ struct UnaryCompoundGradDIntermediateFunctor { } } - inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out, + inline HOSTDEVICE T UseIntermediateOut(T x, + T intermediate_out, + T out, T dout) { if (InPlace) { return dout * d_unary_fun_.UseOut(out); @@ -249,6 +261,5 @@ struct UnaryCompoundGradDIntermediateFunctor { BinaryFun binary_fun_; }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace pten diff --git a/paddle/fluid/operators/math/functors.h b/paddle/pten/kernels/funcs/functors.h similarity index 85% rename from paddle/fluid/operators/math/functors.h rename to paddle/pten/kernels/funcs/functors.h index 054018b10e87e421c45846abf550f0f7a552f6a3..8b2bdfd0b1e32b38c0a9500b67dfa452bcaee97e 100644 --- a/paddle/fluid/operators/math/functors.h +++ b/paddle/pten/kernels/funcs/functors.h @@ -17,16 +17,17 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math.h" -namespace paddle { -namespace operators { -namespace math { - -// MulFunctor -template -struct MulFunctor { - // out = x * y; - inline HOSTDEVICE T operator()(T x, T y) { return x * y; } -}; +namespace pten { +namespace funcs { + +// // MulFunctor +// // NOTE(chenfeiyu): IT IS NOLONGER USED, use pten::funcs::MultiplyFunctor +// instead +// template +// struct MulFunctor { +// // out = x * y; +// inline HOSTDEVICE T operator()(T x, T y) { return x * y; } +// }; template struct MulGradFunctor { @@ -34,12 +35,13 @@ struct MulGradFunctor { inline HOSTDEVICE T Dy(T x, T y) { return x; } }; -// AddFunctor -template -struct AddFunctor { - // out = x + y; - inline HOSTDEVICE T operator()(T x, T y) { return x + y; } -}; +// // AddFunctor +// // NOTE(chenfeiyu): IT IS NOLONGER USED, use pten::funcs::AddFunctor instead +// template +// struct AddFunctor { +// // out = x + y; +// inline HOSTDEVICE T operator()(T x, T y) { return x + y; } +// }; template struct MaxFunctor { @@ -102,7 +104,8 @@ struct TanhFunctor { // y = 2 / (1 + e^-2x) - 1 T t0 = static_cast(2) * x; T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0); - return static_cast(2) / (static_cast(1) + real_exp(-t1)) - + return static_cast(2) / + (static_cast(1) + paddle::operators::real_exp(-t1)) - static_cast(1); } }; @@ -123,7 +126,8 @@ struct SigmoidFunctor { inline HOSTDEVICE T operator()(T x) { // y = 1 / (1 + e^-x) T tmp = (x < kMin) ? kMin : ((x > kMax) ? 
kMax : x); - return static_cast(1) / (static_cast(1) + real_exp(-tmp)); + return static_cast(1) / + (static_cast(1) + paddle::operators::real_exp(-tmp)); } }; @@ -138,7 +142,7 @@ struct SigmoidGradFunctor { template struct GeluFunctor { - using MT = typename details::MPTypeTrait::Type; + using MT = typename paddle::operators::details::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T x) { // this function is tanh approximation of gelu // actual gelu is: @@ -154,7 +158,7 @@ struct GeluFunctor { template struct GeluGradFunctor { - using MT = typename details::MPTypeTrait::Type; + using MT = typename paddle::operators::details::MPTypeTrait::Type; inline HOSTDEVICE T UseX(T x) { MT mx = static_cast(x); MT tanh_out = @@ -193,6 +197,5 @@ struct GeluGradFunctor { } }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function.cc b/paddle/pten/kernels/funcs/math_function.cc index 780068e0381aa87221dadc4b79bb8edb2fdf3842..dec89e79565dea863b1f2837334db372ed415522 100644 --- a/paddle/pten/kernels/funcs/math_function.cc +++ b/paddle/pten/kernels/funcs/math_function.cc @@ -215,14 +215,21 @@ void set_constant_with_place( paddle::platform::errors::Unimplemented("IPUPlace is not supported")); } +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("CustomPlace is not supported")); +} + template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - paddle::framework::VisitDataType( - paddle::framework::TransToProtoVarType(tensor->type()), - TensorSetConstantCPU(tensor, value)); + pten::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); } template <> @@ -239,9 +246,7 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - paddle::framework::VisitDataType( - paddle::framework::TransToProtoVarType(tensor->type()), - TensorSetConstantCPU(tensor, value)); + pten::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); } struct TensorSetConstantWithPlace : public boost::static_visitor { diff --git a/paddle/pten/kernels/funcs/math_function.cu b/paddle/pten/kernels/funcs/math_function.cu index f7cee12b2dfd42c2296a4bd30a739bfe181efb13..8ed72dbd1c1278d320ccebfd7463e83f7c101065 100644 --- a/paddle/pten/kernels/funcs/math_function.cu +++ b/paddle/pten/kernels/funcs/math_function.cu @@ -226,9 +226,8 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - paddle::framework::VisitDataType( - paddle::framework::TransToProtoVarType(tensor->type()), - TensorSetConstantGPU(context, tensor, value)); + pten::VisitDataType(tensor->dtype(), + TensorSetConstantGPU(context, tensor, value)); } template diff --git a/paddle/pten/kernels/funcs/math_function.h b/paddle/pten/kernels/funcs/math_function.h index 73b9dd00bc64095ea2796154ff5d32c407fd9f1b..14f5b5b41489d09e53a47a1ece22d394c22f1c53 100644 --- a/paddle/pten/kernels/funcs/math_function.h +++ b/paddle/pten/kernels/funcs/math_function.h @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/utils/data_type.h" namespace pten { namespace funcs { diff --git a/paddle/pten/kernels/funcs/math_function_impl.h b/paddle/pten/kernels/funcs/math_function_impl.h index 19f3082c05cc27c265fe1354fba666226b88ce1c..a66692363572adf06a0d064fbdf9c82e44eb6d6a 100644 --- a/paddle/pten/kernels/funcs/math_function_impl.h +++ b/paddle/pten/kernels/funcs/math_function_impl.h @@ -30,8 +30,8 @@ void SetConstant::operator()( #ifdef PADDLE_WITH_XPU if (paddle::platform::is_xpu_place(context.GetPlace())) { xpu_place = true; - paddle::framework::VisitDataType( - paddle::framework::TransToProtoVarType(tensor->type()), + pten::VisitDataType( + tensor->dtype(), TensorSetConstantXPU(tensor, num, context.GetPlace())); } #endif diff --git a/paddle/pten/kernels/gpu/abs_grad_kernel.cu b/paddle/pten/kernels/gpu/abs_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a7257e129ec47ac0b5b33923832855b7907cb719 --- /dev/null +++ b/paddle/pten/kernels/gpu/abs_grad_kernel.cu @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/abs_grad_kernel.h" +#include "paddle/pten/kernels/impl/abs_grad_kernel_impl.h" + +using pten::dtype::complex; + +PT_REGISTER_KERNEL(abs_grad, + GPU, + ALL_LAYOUT, + pten::AbsGradKernel, + float, + double, + int, + int64_t, + pten::dtype::float16, + complex, + complex) {} +PT_REGISTER_KERNEL(abs_double_grad, + GPU, + ALL_LAYOUT, + pten::AbsDoubleGradKernel, + float, + double, + int, + int64_t, + pten::dtype::float16, + complex, + complex) {} diff --git a/paddle/pten/kernels/gpu/abs_kernel.cu b/paddle/pten/kernels/gpu/abs_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..06eff050674c3670a2aa07cb43d0baea82fe7202 --- /dev/null +++ b/paddle/pten/kernels/gpu/abs_kernel.cu @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/abs_kernel.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" + +namespace pten { + +template +struct CudaAbsFunctor; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ pten::funcs::Real operator()(const T x) const { + return abs(x); + } +}; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ T operator()(const T x) const { + return std::abs(x); + } +}; + +template +void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc>(out); + std::vector ins = {&x}; + std::vector outs = {out}; + auto functor = CudaAbsFunctor(); + + funcs::LaunchSameDimsElementwiseCudaKernel>( + ctx, ins, &outs, functor); +} + +} // namespace pten + +PT_REGISTER_KERNEL(abs, + GPU, + ALL_LAYOUT, + pten::AbsKernel, + float, + double, + int, + int64_t, + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/gpu/histogram_kernel.cu b/paddle/pten/kernels/gpu/histogram_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d0da49e01aebbff375015ddfd7bc90309f9e4d8 --- /dev/null +++ b/paddle/pten/kernels/gpu/histogram_kernel.cu @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/funcs/math_function.h" +#include "paddle/pten/kernels/histogram_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/eigen_function.h" + +namespace pten { + +using IndexType = int64_t; +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__device__ static IndexType GetBin(T input_value, + T min_value, + T max_value, + int64_t nbins) { + IndexType bin = static_cast((input_value - min_value) * nbins / + (max_value - min_value)); + IndexType output_index = bin < nbins - 1 ? 
bin : nbins - 1; + return output_index; +} + +template +__global__ void KernelHistogram(const T* input, + const int total_elements, + const int64_t nbins, + const T min_value, + const T max_value, + int64_t* output) { + extern __shared__ int64_t buf_hist[]; + for (int i = threadIdx.x; i < nbins; i += blockDim.x) { + buf_hist[i] = 0; + } + __syncthreads(); + + CUDA_KERNEL_LOOP(input_index, total_elements) { + // const IndexType input_index = threadIdx.x + blockIdx.x * blockDim.x; + const auto input_value = input[input_index]; + if (input_value >= min_value && input_value <= max_value) { + const IndexType output_index = + GetBin(input_value, min_value, max_value, nbins); + paddle::platform::CudaAtomicAdd(&buf_hist[output_index], 1); + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < nbins; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[i], buf_hist[i]); + } +} + +template +void HistogramKernel(const Context& dev_ctx, + const DenseTensor& input, + int64_t bins, + int min, + int max, + DenseTensor* output) { + auto& nbins = bins; + auto& minval = min; + auto& maxval = max; + + const T* input_data = input.data(); + const int input_numel = input.numel(); + + int64_t* out_data = output->mutable_data(dev_ctx.GetPlace()); + pten::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + + if (input_data == nullptr) return; + + T output_min = static_cast(minval); + T output_max = static_cast(maxval); + + if (output_min == output_max) { + auto input_x = pten::EigenVector::Flatten(input); + + DenseTensor input_min_t, input_max_t; + auto* input_min_data = input_min_t.mutable_data({1}, dev_ctx.GetPlace()); + auto* input_max_data = input_max_t.mutable_data({1}, dev_ctx.GetPlace()); + auto input_min_scala = pten::EigenScalar::From(input_min_t); + auto input_max_scala = pten::EigenScalar::From(input_max_t); + + auto* place = dev_ctx.eigen_device(); + input_min_scala.device(*place) = input_x.minimum(); + input_max_scala.device(*place) = input_x.maximum(); + + DenseTensor input_min_cpu, input_max_cpu; + paddle::framework::TensorCopySync( + input_min_t, paddle::platform::CPUPlace(), &input_min_cpu); + paddle::framework::TensorCopySync( + input_max_t, paddle::platform::CPUPlace(), &input_max_cpu); + + output_min = input_min_cpu.data()[0]; + output_max = input_max_cpu.data()[0]; + } + if (output_min == output_max) { + output_min = output_min - 1; + output_max = output_max + 1; + } + + PADDLE_ENFORCE_EQ( + (std::isinf(static_cast(output_min)) || + std::isnan(static_cast(output_max)) || + std::isinf(static_cast(output_min)) || + std::isnan(static_cast(output_max))), + false, + pten::errors::OutOfRange("range of min, max is not finite")); + PADDLE_ENFORCE_GE( + output_max, + output_min, + pten::errors::InvalidArgument( + "max must be larger or equal to min. If min and max are both zero, " + "the minimum and maximum values of the data are used. 
" + "But received max is %d, min is %d", + maxval, + minval)); + + auto stream = dev_ctx.stream(); + KernelHistogram<<>>( + input_data, input_numel, nbins, output_min, output_max, out_data); +} + +} // namespace pten + +PT_REGISTER_KERNEL(histogram, + GPU, + ALL_LAYOUT, + pten::HistogramKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/lerp_op.cu b/paddle/pten/kernels/gpu/lerp_grad_kernel.cu similarity index 54% rename from paddle/fluid/operators/lerp_op.cu rename to paddle/pten/kernels/gpu/lerp_grad_kernel.cu index 6f7d8b744d694f0cd5cbc9bb218034be435ba6f0..30fdb1206f45e5ffd68b3ab75c9bbc065d458f8e 100644 --- a/paddle/fluid/operators/lerp_op.cu +++ b/paddle/pten/kernels/gpu/lerp_grad_kernel.cu @@ -12,16 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lerp_op.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/lerp_grad_kernel_impl.h" +#include "paddle/pten/kernels/lerp_grad_kernel.h" -REGISTER_OP_CUDA_KERNEL( - lerp, - paddle::operators::LerpKernel, - paddle::operators::LerpKernel); - -REGISTER_OP_CUDA_KERNEL( - lerp_grad, - paddle::operators::LerpGradKernel, - paddle::operators::LerpGradKernel); +PT_REGISTER_KERNEL( + lerp_grad, GPU, ALL_LAYOUT, pten::LerpGradKernel, float, double) {} diff --git a/paddle/pten/kernels/gpu/lerp_kernel.cu b/paddle/pten/kernels/gpu/lerp_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8743cb12e491b1a6fa96d44b115304a9b8b1c7c9 --- /dev/null +++ b/paddle/pten/kernels/gpu/lerp_kernel.cu @@ -0,0 +1,20 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/lerp_kernel_impl.h" +#include "paddle/pten/kernels/lerp_kernel.h" + +PT_REGISTER_KERNEL(lerp, GPU, ALL_LAYOUT, pten::LerpKernel, float, double) {} diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index 31c44673f94e737bd94882b2537ddf3fababf226..7df99260aa1614a29325ed1d0834400566e28139 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -26,6 +26,7 @@ PT_REGISTER_KERNEL(matmul_grad, float, double, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index f9fdbd27bf94e4b236efe5a49e471e39c4c57dd5..b365581e949c103be511e4849a45b4fd9a024f77 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -27,5 +27,6 @@ PT_REGISTER_KERNEL(matmul, float, double, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} diff --git a/paddle/pten/kernels/histogram_kernel.h b/paddle/pten/kernels/histogram_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4bc4ef6fb9e4657305f4f967371711a0aaabb035 --- /dev/null +++ b/paddle/pten/kernels/histogram_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +namespace pten { + +template +void HistogramSelectKernel(const Context& dev_ctx, + const DenseTensor& input, + int64_t bins, + int min, + int max, + DenseTensor* out); + +} // namespace pten diff --git a/paddle/pten/kernels/impl/abs_grad_kernel_impl.h b/paddle/pten/kernels/impl/abs_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ff829e10b2d8bf06971173978627632bf18fa93f --- /dev/null +++ b/paddle/pten/kernels/impl/abs_grad_kernel_impl.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/abs_grad_kernel.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" + +namespace pten { + +template +void AbsGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { + auto numel = dout.numel(); + auto* dout_data = dout.data>(); + auto* x_data = x.data(); + + ctx.template Alloc(dx, static_cast(numel * sizeof(T))); + auto* dx_data = dx->data(); + + paddle::platform::ForRange for_range(ctx, numel); + pten::funcs::AbsGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); +} + +template +void AbsDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& ddx, + DenseTensor* ddout) { + auto numel = ddx.numel(); + auto* ddx_data = ddx.data(); + auto* x_data = x.data(); + ctx.template Alloc(ddout, static_cast(numel * sizeof(T))); + auto* ddout_data = ddout->data(); + + paddle::platform::ForRange for_range(ctx, numel); + pten::funcs::AbsGradGradFunctor functor( + ddx_data, x_data, ddout_data, numel); + for_range(functor); +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/complex_kernel_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h index 7e4c4f0d66d4fc89634eb7bde9eb24e2743d4a7c..17cfb886e57b813fa744ebc232d6cc38e6b0f951 100644 --- a/paddle/pten/kernels/impl/complex_kernel_impl.h +++ b/paddle/pten/kernels/impl/complex_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace pten { @@ -29,7 +29,7 @@ void ConjKernel(const Context& dev_ctx, auto* out_data = dev_ctx.template Alloc(out); paddle::platform::ForRange for_range(dev_ctx, numel); - paddle::operators::math::ConjFunctor functor(x_data, numel, out_data); + pten::funcs::ConjFunctor functor(x_data, numel, out_data); for_range(functor); } diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h index d4ea9fc944527145269fdfd1a854aca1299a6018..4ed47bd69dd5f6b37b179cc9534fde64f949b5de 100644 --- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/complex_kernel.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" namespace pten { @@ -35,9 +35,7 @@ struct DotGradFunction { }; template -struct DotGradFunction> { +struct DotGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* tensor_x, const DenseTensor* tensor_y, @@ -133,9 +131,7 @@ struct DotGradFunction -struct DotGradFunction> { +struct DotGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* tensor_x, const DenseTensor* tensor_y, @@ -221,9 +217,7 @@ struct DotDoubleGradFunction { }; template -struct DotDoubleGradFunction> { +struct DotDoubleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* tensor_x, const DenseTensor* tensor_y, @@ -334,9 +328,7 @@ struct DotDoubleGradFunction -struct DotDoubleGradFunction> { +struct DotDoubleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* tensor_x, const DenseTensor* tensor_y, @@ -461,9 +453,7 @@ struct DotTripleGradFunction { // TODO(wuweilong): enable this function when the unittests framewark for multi // grad is ok (dtype: complex64 or complex128). template -struct DotTripleGradFunction> { +struct DotTripleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* in_tensor_x, const DenseTensor* in_tensor_y, @@ -656,9 +646,7 @@ struct DotTripleGradFunction -struct DotTripleGradFunction> { +struct DotTripleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* in_tensor_x, const DenseTensor* in_tensor_y, diff --git a/paddle/pten/kernels/impl/lerp_grad_kernel_impl.h b/paddle/pten/kernels/impl/lerp_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..5285c69e39a17d5971212e83d85c095bd14ed873 --- /dev/null +++ b/paddle/pten/kernels/impl/lerp_grad_kernel_impl.h @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/pten/kernels/funcs/common_shape.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" + +namespace pten { + +template +static void LerpGradFunction(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto& w = weight; + auto& dout = out_grad; + auto* dx = x_grad; + auto* dy = y_grad; + + auto dout_dims = dout.dims(); + auto dx_dims = pten::funcs::ExtendDims2Rank(dx->dims(), D); + auto dy_dims = pten::funcs::ExtendDims2Rank(dy->dims(), D); + auto w_dims = pten::funcs::ExtendDims2Rank(w.dims(), D); + Eigen::DSizes dx_bcast_dims; + Eigen::DSizes dy_bcast_dims; + Eigen::DSizes w_bcast_dims; + pten::funcs::GetBroadcastDims(dx_dims, dout_dims, &dx_bcast_dims); + pten::funcs::GetBroadcastDims(dy_dims, dout_dims, &dy_bcast_dims); + pten::funcs::GetBroadcastDims(w_dims, dout_dims, &w_bcast_dims); + + auto eigen_w = pten::EigenTensor::From(w, w_dims); + auto eigen_dout = pten::EigenTensor::From(dout); + + Eigen::DSizes dx_reshape_dims; + Eigen::DSizes dy_reshape_dims; + Eigen::DSizes reduce_dims; + for (int i = 0; i < dout_dims.size(); ++i) { + dx_reshape_dims[2 * i] = dx_bcast_dims[i]; + dx_reshape_dims[2 * i + 1] = dx_dims[i]; + dy_reshape_dims[2 * i] = dy_bcast_dims[i]; + dy_reshape_dims[2 * i + 1] = dy_dims[i]; + reduce_dims[i] = 2 * i; + } + + auto& place = *ctx.eigen_device(); + + if (dx) { + ctx.template Alloc(dx); + auto eigen_dx = pten::EigenTensor::From(*dx, dx_dims); + auto eigen_expr = (1 - eigen_w.broadcast(w_bcast_dims)) * eigen_dout; + eigen_dx.device(place) = eigen_expr.reshape(dx_reshape_dims) + .sum(reduce_dims) + .reshape(eigen_dx.dimensions()); + } + if (dy) { + ctx.template Alloc(dy); + auto eigen_dy = pten::EigenTensor::From(*dy, dy_dims); + auto eigen_expr = eigen_w.broadcast(w_bcast_dims) * eigen_dout; + eigen_dy.device(place) = eigen_expr.reshape(dy_reshape_dims) + .sum(reduce_dims) + .reshape(eigen_dy.dimensions()); + } +} + +template +void LerpGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + int rank = out.dims().size(); + PADDLE_ENFORCE_GE( + rank, + 1, + pten::errors::InvalidArgument( + "The number of dimensions for LerpGradOp must be " + "greater than or equal to 1, but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, + 6, + pten::errors::InvalidArgument( + "The number of dimensions for LerpGradOp must be " + "less than or equal to 6, but the value received is %d.", + rank)); + switch (rank) { + case 1: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 2: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 3: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 4: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 5: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + case 6: + LerpGradFunction( + ctx, x, y, weight, out, out_grad, x_grad, y_grad); + break; + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/lerp_kernel_impl.h b/paddle/pten/kernels/impl/lerp_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..127e3e50a3651dd9cc998b9920de17566bc996ba --- /dev/null +++ 
b/paddle/pten/kernels/impl/lerp_kernel_impl.h @@ -0,0 +1,97 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/funcs/common_shape.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" + +namespace pten { + +template +static void LerpFunction(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + DenseTensor* out) { + ctx.template Alloc(out); + + auto out_dims = out->dims(); + auto x_dims = pten::funcs::ExtendDims2Rank(x.dims(), D); + auto y_dims = pten::funcs::ExtendDims2Rank(y.dims(), D); + auto w_dims = pten::funcs::ExtendDims2Rank(weight.dims(), D); + Eigen::DSizes x_bcast_dims; + Eigen::DSizes y_bcast_dims; + Eigen::DSizes w_bcast_dims; + pten::funcs::GetBroadcastDims(x_dims, out_dims, &x_bcast_dims); + pten::funcs::GetBroadcastDims(y_dims, out_dims, &y_bcast_dims); + pten::funcs::GetBroadcastDims(w_dims, out_dims, &w_bcast_dims); + + auto eigen_x = pten::EigenTensor::From(x, x_dims); + auto eigen_y = pten::EigenTensor::From(y, y_dims); + auto eigen_w = pten::EigenTensor::From(weight, w_dims); + auto eigen_out = pten::EigenTensor::From(*out); + + auto& place = *ctx.eigen_device(); + eigen_out.device(place) = + eigen_x.broadcast(x_bcast_dims) + + eigen_w.broadcast(w_bcast_dims) * + (eigen_y.broadcast(y_bcast_dims) - eigen_x.broadcast(x_bcast_dims)); +} + +template +void LerpKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + DenseTensor* out) { + int rank = out->dims().size(); + PADDLE_ENFORCE_GE( + rank, + 1, + pten::errors::InvalidArgument( + "The number of dimensions for LerpOp must be " + "greater than or equal to 1, but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, + 6, + pten::errors::InvalidArgument( + "The number of dimensions for LerpOp must be " + "less than or equal to 6, but the value received is %d.", + rank)); + switch (rank) { + case 1: + LerpFunction(ctx, x, y, weight, out); + break; + case 2: + LerpFunction(ctx, x, y, weight, out); + break; + case 3: + LerpFunction(ctx, x, y, weight, out); + break; + case 4: + LerpFunction(ctx, x, y, weight, out); + break; + case 5: + LerpFunction(ctx, x, y, weight, out); + break; + case 6: + LerpFunction(ctx, x, y, weight, out); + break; + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index 858807a1d4d6496d5e3091aa71f5b2dada03b92e..addea622f140210ae714da2eda775f6ce6568eca 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/kernels/lerp_grad_kernel.h b/paddle/pten/kernels/lerp_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..18a38e724505f214e29fd4d18f187f0c59012700 --- /dev/null +++ b/paddle/pten/kernels/lerp_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void LerpGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace pten diff --git a/paddle/pten/kernels/lerp_kernel.h b/paddle/pten/kernels/lerp_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8e70a53c06b5cb3930f89c241a8bfd82a511a6b6 --- /dev/null +++ b/paddle/pten/kernels/lerp_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void LerpKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + DenseTensor* out); + +} // namespace pten diff --git a/paddle/pten/kernels/selected_rows/scale_kernel.cc b/paddle/pten/kernels/selected_rows/scale_kernel.cc index 09700d8afe0508e51cbdaff8404d97c4e25f5b9d..32f7a41a5b9688710450713a4b96c68906d26ad5 100644 --- a/paddle/pten/kernels/selected_rows/scale_kernel.cc +++ b/paddle/pten/kernels/selected_rows/scale_kernel.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,10 +16,8 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/core/kernel_registry.h" - -// See Note [ Why still include the fluid headers? 
] #include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/core/kernel_registry.h" namespace pten { template diff --git a/paddle/pten/ops/compat/abs_sig.cc b/paddle/pten/ops/compat/abs_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a610db46a16f4da0df1ebbf2cd0d5fda174cde50 --- /dev/null +++ b/paddle/pten/ops/compat/abs_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature AbsOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("abs", {"X"}, {}, {"Out"}); +} + +KernelSignature AbsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "abs_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); +} + +KernelSignature AbsDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("abs_double_grad", {"X", "DDX"}, {}, {"DDOut"}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(abs, pten::AbsOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(abs_grad, pten::AbsGradOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(abs_double_grad, + pten::AbsDoubleGradOpArgumentMapping); diff --git a/paddle/pten/ops/compat/elementwise_sig.cc b/paddle/pten/ops/compat/elementwise_sig.cc index c1941f6dde30baca60c3647ca0e2267c8a0d65f1..6541334ee27ec21d92ebcab67af1186bafadbfb2 100644 --- a/paddle/pten/ops/compat/elementwise_sig.cc +++ b/paddle/pten/ops/compat/elementwise_sig.cc @@ -75,6 +75,31 @@ KernelSignature ElementwiseAddGradOpArgumentMapping( return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ElementwiseAddDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "add_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); +} + +KernelSignature ElementwiseAddTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("add_triple_grad", + {"DDX", "DDY", "D_DDOut"}, + {"axis"}, + {"D_DDX", "D_DDY"}); +} + +KernelSignature ElementwiseSubGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + return KernelSignature("subtract_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + } // namespace pten PT_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); @@ -82,6 +107,9 @@ PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract); PT_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply); PT_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide); PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); PT_REGISTER_ARG_MAPPING_FN(elementwise_add, 
pten::ElementwiseAddOpArgumentMapping); @@ -93,3 +121,9 @@ PT_REGISTER_ARG_MAPPING_FN(elementwise_div, pten::ElementwiseDivOpArgumentMapping); PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, pten::ElementwiseAddGradOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad, + pten::ElementwiseAddDoubleGradOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad, + pten::ElementwiseAddTripleGradOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, + pten::ElementwiseSubGradOpArgumentMapping); diff --git a/paddle/pten/ops/compat/histogram_sig.cc b/paddle/pten/ops/compat/histogram_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..9849c998d779e46bb955f0bc98686c247fc99b18 --- /dev/null +++ b/paddle/pten/ops/compat/histogram_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature HistogramOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("histogram", {"X"}, {"bins", "min", "max"}, {"Out"}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(histogram, pten::HistogramOpArgumentMapping); diff --git a/paddle/pten/ops/compat/lerp_sig.cc b/paddle/pten/ops/compat/lerp_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d225ff2bfd3e2bd68130c6d8a14b71df0069c5c5 --- /dev/null +++ b/paddle/pten/ops/compat/lerp_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
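Aside (editor's note, not part of this diff): the elementwise_sub_grad -> subtract_grad mapping registered above is the route a plain subtract backward takes through the new kernel registry; a small sanity check.

    import paddle

    x = paddle.to_tensor([3.0, 5.0], stop_gradient=False)
    y = paddle.to_tensor([1.0, 2.0], stop_gradient=False)
    paddle.subtract(x, y).sum().backward()
    print(x.grad)  # [ 1.,  1.]
    print(y.grad)  # [-1., -1.]
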
+ +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature LerpOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("lerp", {"X", "Y", "Weight"}, {}, {"Out"}); +} + +KernelSignature LerpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("lerp_grad", + {"X", "Y", "Weight", "Out", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(lerp, pten::LerpOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(lerp_grad, pten::LerpGradOpArgumentMapping); diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt index 32e6e0784dad0c716cfea384b46933f11adbe5d0..971d9112eead97f46ab1f165c9073ac525464676 100644 --- a/paddle/pten/tests/core/CMakeLists.txt +++ b/paddle/pten/tests/core/CMakeLists.txt @@ -1,7 +1,6 @@ cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) -cc_test(test_convert_utils SRCS test_convert_utils.cc DEPS convert_utils) cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scale_kernel) cc_test(test_sparse_coo_tensor SRCS test_sparse_coo_tensor.cc DEPS dense_tensor sparse_coo_tensor) cc_test(test_sparse_csr_tensor SRCS test_sparse_csr_tensor.cc DEPS dense_tensor sparse_csr_tensor) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 711b8811b973c7172af5733c70efd46cd6f25e77..35b2ce751b18fff2aac8dedfd09e5fe209d95533 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -91,6 +91,7 @@ if "%WITH_PYTHON%" == "ON" ( where pip pip install wheel --user pip install pyyaml --user + pip install wget --user pip install -r %work_dir%\python\requirements.txt --user if !ERRORLEVEL! NEQ 0 ( echo pip install requirements.txt failed! @@ -175,19 +176,20 @@ rem -------Caching strategy 1: End -------------------------------- rem -------Caching strategy 2: sccache decorate compiler----------- +if not defined SCCACHE_ROOT set SCCACHE_ROOT=D:\sccache if "%WITH_SCCACHE%"=="ON" ( - del D:\sccache\sccache_log.txt cmd /C sccache -V || call :install_sccache sccache --stop-server 2> NUL + del %SCCACHE_ROOT%\sccache_log.txt :: Localy storage on windows - if not exist D:\sccache mkdir D:\sccache - set SCCACHE_DIR=D:\sccache\.cache + if not exist %SCCACHE_ROOT% mkdir %SCCACHE_ROOT% + set SCCACHE_DIR=%SCCACHE_ROOT%\.cache :: Sccache will shut down if a source file takes more than 10 mins to compile set SCCACHE_IDLE_TIMEOUT=0 set SCCACHE_CACHE_SIZE=100G - set SCCACHE_ERROR_LOG=D:\sccache\sccache_log.txt + set SCCACHE_ERROR_LOG=%SCCACHE_ROOT%\sccache_log.txt set SCCACHE_LOG=quiet :: Distributed storage on windows @@ -208,7 +210,7 @@ if "%WITH_SCCACHE%"=="ON" ( echo There is not sccache in this PC, will install sccache. 
echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')" -xcopy sccache.exe %PYTHON_ROOT%\Scripts\ /Y +xcopy sccache.exe %PYTHON_ROOT%\ /Y goto:eof rem -------Caching strategy 2: End -------------------------------- diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 8ce9716b169b9c64b82d66b949609ff502775942..12d31aee41e394968d58753f2b54fcce8648a35e 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -293,6 +293,7 @@ from .framework import CUDAPlace # noqa: F401 from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 from .framework import MLUPlace # noqa: F401 +from .framework import CustomPlace # noqa: F401 from .autograd import grad # noqa: F401 from .autograd import no_grad # noqa: F401 diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index d102473fef791124e0605008dd1844507c3b4a61..89e0ae49fc48f73840129826952a01aec07dd3ab 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -36,7 +36,11 @@ __all__ = [ # noqa 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_npu', - 'is_compiled_with_mlu' + 'is_compiled_with_mlu', + 'get_all_device_type', + 'get_all_custom_device_type', + 'get_available_device', + 'get_available_custom_device', ] _cudnn_version = None @@ -225,15 +229,26 @@ def _convert_to_place(device): selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") device_id = int(selected_mlus[0]) place = core.MLUPlace(device_id) + elif device in core.get_all_custom_device_type(): + place = core.CustomPlace(device, 0) else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) avaliable_npu_device = re.match(r'npu:\d+', lower_device) avaliable_mlu_device = re.match(r'mlu:\d+', lower_device) if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device: - raise ValueError( - "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'mlu', 'mlu:x', 'npu', 'npu:x' or ipu" - ) + device_info_list = device.split(':', 1) + device_type = device_info_list[0] + if device_type in core.get_all_custom_device_type(): + device_id = device_info_list[1] + device_id = int(device_id) + place = core.CustomPlace(device_type, device_id) + else: + raise ValueError( + "The device must be a string which is like 'cpu', {}". + format(', '.join("'{}', '{}:x'".format(x, x) + for x in ['gpu', 'xpu', 'npu', 'mlu'] + + core.get_all_custom_device_type()))) if avaliable_gpu_device: if not core.is_compiled_with_cuda(): raise ValueError( @@ -338,3 +353,103 @@ def get_device(): raise ValueError("The device specification {} is invalid".format(place)) return device + + +def get_all_device_type(): + """ + Get all available device types. + + Returns: + A list of all available device types. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_all_device_type() + + # Case 1: paddlepaddle-cpu package installed, and no custom device registerd. + # Output: ['cpu'] + + # Case 2: paddlepaddle-gpu package installed, and no custom device registerd. + # Output: ['cpu', 'gpu'] + + # Case 3: paddlepaddle-cpu package installed, and custom deivce 'CustomCPU' is registerd. 
+ # Output: ['cpu', 'CustomCPU'] + + # Case 4: paddlepaddle-gpu package installed, and custom deivce 'CustomCPU' and 'CustomGPU' is registerd. + # Output: ['cpu', 'gpu', 'CustomCPU', 'CustomGPU'] + """ + return core.get_all_device_type() + + +def get_all_custom_device_type(): + """ + Get all available custom device types. + + Returns: + A list of all available custom device types. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_all_custom_device_type() + + # Case 1: paddlepaddle-gpu package installed, and no custom device registerd. + # Output: None + + # Case 2: paddlepaddle-gpu package installed, and custom deivce 'CustomCPU' and 'CustomGPU' is registerd. + # Output: ['CustomCPU', 'CustomGPU'] + """ + return core.get_all_custom_device_type() + + +def get_available_device(): + """ + Get all available devices. + + Returns: + A list of all available devices. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_available_device() + + # Case 1: paddlepaddle-cpu package installed, and no custom device registerd. + # Output: ['cpu'] + + # Case 2: paddlepaddle-gpu package installed, and no custom device registerd. + # Output: ['cpu', 'gpu:0', 'gpu:1'] + + # Case 3: paddlepaddle-cpu package installed, and custom deivce 'CustomCPU' is registerd. + # Output: ['cpu', 'CustomCPU'] + + # Case 4: paddlepaddle-gpu package installed, and custom deivce 'CustomCPU' and 'CustomGPU' is registerd. + # Output: ['cpu', 'gpu:0', 'gpu:1', 'CustomCPU', 'CustomGPU:0', 'CustomGPU:1'] + """ + return core.get_available_device() + + +def get_available_custom_device(): + """ + Get all available custom devices. + + Returns: + A list of all available custom devices. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_available_custom_device() + + # Case 1: paddlepaddle-gpu package installed, and no custom device registerd. + # Output: None + + # Case 2: paddlepaddle-gpu package installed, and custom deivce 'CustomCPU' and 'CustomGPU' is registerd. + # Output: ['CustomCPU', 'CustomGPU:0', 'CustomGPU:1'] + """ + return core.get_available_custom_device() diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 59a92930d22a91f752421d1bf0f64e1f38f12e02..bc50bef010941a48c367046221d17c75138753c2 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -54,14 +54,15 @@ class ParameterServerOptimizer(MetaOptimizerBase): attrs['user_defined_strategy'] = self.user_defined_strategy attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy) attrs['ps_mode'] = attrs['trainer'].mode - + logger.info("ps_mode: {}".format(attrs['ps_mode'])) attrs['role_maker'] = self.role_maker attrs[ 'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode attrs['is_worker'] = self.role_maker._is_worker() attrs['is_server'] = self.role_maker._is_server() attrs['is_heter_worker'] = self.role_maker._is_heter_worker() - + logger.info("this process is heter? 
{}".format(attrs[ + 'is_heter_worker'])) attrs['use_ps_gpu'] = self.user_defined_strategy.a_sync_configs[ "use_ps_gpu"] attrs['lr_decay_steps'] = self.user_defined_strategy.a_sync_configs[ diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index fff10a2d4684afe51295cc460f8dc3424d13c4f5..3f39db69abdb2930ec40ffb02cb34dce7be6a034 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -47,7 +47,7 @@ class AppendSendOpsPass(PassBase): # 该 pass 被多种模式复用 if ps_mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]: dummy_output = program.global_block().create_var( name=framework.generate_control_dev_var_name()) - + logger.info("dummy_output: {}".format(dummy_output)) program.global_block().append_op( type="send", inputs={"X": send_input_vars}, @@ -61,7 +61,7 @@ class AppendSendOpsPass(PassBase): # 该 pass 被多种模式复用 return dummy_output - def _append_barrier_op(self, program, dummys): + def _append_barrier_op(self, program, dummys, trainer_id): program.global_block().append_op( type="send_barrier", inputs={"X": dummys}, @@ -79,19 +79,24 @@ class AppendSendOpsPass(PassBase): # 该 pass 被多种模式复用 send_ctx = get_geo_trainer_send_context(attrs) # geo 模式 else: send_ctx = get_the_one_send_context(attrs) # async、sync 等各种模式 + logger.info("send_ctx: {}".format(send_ctx)) dummys = [] for merged_name, send in send_ctx.items(): if send.is_sparse() and ps_mode != DistributedMode.GEO: continue + logger.info('merged_name, send: {}, {}'.format(merged_name, send)) is_sparse = 1 if send.is_sparse() else 0 is_sparse = 2 if send.is_distributed() else is_sparse dummys.append( self._append_send_op(main_program, send.origin_varnames(), merged_name, is_sparse, send.table_id(), ps_mode)) - + logger.info('ps trainer pass - ps mode: {}'.format(ps_mode)) + logger.info('dummys: {}'.format(dummys)) if ps_mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]: - self._append_barrier_op(main_program, dummys) + logger.info('insert send_barrier_op') + trainer_id = get_role_id(attrs['role_maker']) + self._append_barrier_op(main_program, dummys, trainer_id) @register_pass("distributed_ops_pass") diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index d649a74e4d621bbc531ce194242fbbd07b01209a..c6afd0cb03bf3f8d164082d9cbadf8dd7c08254f 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -97,7 +97,7 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder): def __init__(self, pass_ctx): logger.info("start building cpu-sync-ps program") super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx) - if self.ps_mode == DistributedMode.GEO: + if self.ps_mode != DistributedMode.SYNC: raise ValueError("ps mode: {} not matched {}", format(ps_mode, "CpuSyncPsProgramBuilder")) diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index a8587874776bb5f5586dd23ed32c1ee810ad97c0..7743db1057dd66e7467efee0cc0253c083ff335c 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -83,8 +83,10 @@ class DistributedMode: class TrainerRuntimeConfig(object): def __init__(self, valid_strategy): - + self.mode = None k_steps = valid_strategy.a_sync_configs["k_steps"] + logger.info("ps mode in strategy: {}, {}".format( + valid_strategy.a_sync, 
valid_strategy.a_sync_configs["k_steps"])) if not valid_strategy.a_sync and k_steps == 0: self.mode = DistributedMode.SYNC @@ -94,7 +96,6 @@ class TrainerRuntimeConfig(object): if valid_strategy.a_sync and k_steps > 0: self.mode = DistributedMode.GEO - self.mode = None num_threads = os.getenv("CPU_NUM", "1") self.runtime_configs = {} @@ -161,6 +162,13 @@ def get_dist_env(): } +def get_role_id(role_maker): + try: + return role_maker._role_id() + except Exception: + return role_maker.role_id() + + def get_ps_endpoint(role_maker): try: return role_maker._get_pserver_endpoints()[get_role_id(role_maker)] @@ -184,7 +192,7 @@ def get_trainer_endpoint(role_maker): def get_previous_stage_trainers(role_maker): try: - return role_maker_get_previous_trainers() + return role_maker._get_previous_trainers() except Exception: return role_maker.get_previous_trainers() @@ -229,18 +237,11 @@ def get_sparse_tablenames(program, is_distributed): return list(tablenames) -def get_role_id(role_maker): - try: - return role_maker._role_id() - except Exception: - return role_maker.role_id() - - def get_ps_endpoints(role_maker): try: - return role_maker._get_pserver_endpoints()[get_role_id(role_maker)] + return role_maker._get_pserver_endpoints() except Exception: - return role_maker.get_pserver_endpoints()[get_role_id(role_maker)] + return role_maker.get_pserver_endpoints() def get_trainers(role_maker): @@ -296,8 +297,35 @@ def get_geo_trainer_send_context(context): if context['ps_mode'] != DistributedMode.GEO: raise ValueError("ps mode: {} not matched {}", format(ps_mode, "get_geo_trainer_send_context")) - send_ctx = {} + trainer_id = get_role_id(context['role_maker']) + idx = 0 + + distibuted_varnames = get_sparse_tablenames(context['origin_main_program'], + True) + for merged in context['merged_sparse_pairs']: + param, grad = merged + grad_name = grad.merged_var.name + param_name = param.merged_var.name + is_distributed = True if param_name in distibuted_varnames else False + + var = context['origin_main_program'].global_block().vars[ + grad.merged_var.name] + var_numel = reduce(lambda x, y: x * y, var.shape[1:]) + + sparse_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], + [var_numel], [grad_name], trainer_id, True, + True, is_distributed, idx, False) + idx += 1 + send_ctx[sparse_ctx.var_name()] = sparse_ctx + + if len(send_ctx) == 0: + raise ValueError("GeoSGD require sparse parameters in your net.") + + if len(context['tensor_table']) > 0 and context['is_worker']: + name, ctx = _step_ctx(idx, context['role_maker']) + send_ctx[name] = ctx + return send_ctx @@ -1253,6 +1281,60 @@ def find_op_input_output(program, block, op): return input_var_list, output_var_list +def add_heter_send_op(program, heter_program, block, block_var_detail): + def _get_send_op_dict(): + send_op_dict = {} + send_op_list = find_send_op(program) + for op in send_op_list: + input_list, _ = find_op_input_output(program, + program.global_block(), op) + for var in input_list: + send_op_dict[var] = op + return send_op_dict + + send_grad_var_list = [] + send_op_dict = _get_send_op_dict() + table_dict = {} + for persistable_var in block_var_detail["backward"]["persistables"]: + if "@GRAD" not in persistable_var: + continue + if "GRAD" != persistable_var.split("@")[-1]: + continue + if persistable_var not in send_op_dict: + continue + send_op = send_op_dict[persistable_var] + is_sparse = send_op.attr('is_sparse') + table_id = send_op.attr('table_id') + send_varnames = send_op.attr('send_varnames') + 
send_grad_var_list.append(persistable_var) + if table_id not in table_dict: + table_dict[table_id] = {} + table_dict[table_id]['var_list'] = [] + table_dict[table_id]['is_sparse'] = is_sparse + table_dict[table_id]['send_varnames'] = send_varnames + table_dict[table_id]['var_list'].append(persistable_var) + + for table_id in table_dict: + dummy_output = block.create_var( + name=framework.generate_control_dev_var_name()) + send_input_vars = [ + block.vars[union_var] + for union_var in table_dict[table_id]['var_list'] + ] + block.append_op( + type="send", + inputs={"X": send_input_vars}, + outputs={"Out": dummy_output}, + attrs={ + "send_varnames": table_dict[table_id]['send_varnames'], + "is_sparse": is_sparse, + "table_id": table_id, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + }) + + return send_grad_var_list + + def get_vars_name_in_block(block): vars_list = block.vars.keys() vars_name_list = [var_name for var_name in vars_list] @@ -1302,10 +1384,6 @@ def create_backward_block(program, origin_program, bp_ops_list, return heter_block -def debug_program(file, program, is_trainer): - if is_trainer: - with open(file, 'w+') as f: - f.write(str(program)) - else: - with open(file, 'w+') as f: - f.write(str(program)) +def debug_program(file, program): + with open(file, 'w+') as f: + f.write(str(program)) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index db6faa1a1b16578b95db4d81ab5bd66e5a003f75..997075590e5cf97241188b847c0c5b5036ecee59 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -71,7 +71,7 @@ from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, Scope, _Scope -from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace +from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace, CustomPlace from .incubate import fleet from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 26371d0d6ee7353f5660e55a6e381a177f378fd9..3bcefc41d2e781aa904f7ab581af3d72bc97b0d9 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -104,14 +104,14 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''): expected_type += (core.VarBase, ) # TODO(jiabin): uncomment it when we support declarative mode in eager # if _in_eager_mode(): - # expected_type += (core.eager.EagerTensor, ) + # expected_type += (core.eager.Tensor, ) elif isinstance(input, core.VarBase): raise TypeError( "Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. " "Because received '{}' in {} is a imperative Variable.".format( input_name, op_name)) elif hasattr(core, "eager"): - if isinstance(input, core.eager.EagerTensor): + if isinstance(input, core.eager.Tensor): raise TypeError( "Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. 
" "Because received '{}' in {} is a imperative Variable.".format( diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index f4ccd033aa5fc41f67d63802bc1abdc6722adb3a..706ec0d523b938fda0501dfd04f1fc976bf6a26b 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -253,7 +253,7 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): try: if in_dygraph_mode(): if _in_eager_mode(): - data = core.eager.read_next_eager_tensor_list( + data = core.eager.read_next_tensor_list( self._reader.read_next_list()[0]) else: data = self._reader.read_next_var_list() @@ -449,7 +449,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): while self._blocking_queue.size() >= len(self._places): if in_dygraph_mode(): if _in_eager_mode(): - data = core.eager.read_next_eager_tensor_list( + data = core.eager.read_next_tensor_list( self._reader.read_next_list()[0]) else: self._reader.read_next_var_list() @@ -705,7 +705,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): if in_dygraph_mode(): if _in_eager_mode(): - data = core.eager.read_next_eager_tensor_list( + data = core.eager.read_next_tensor_list( self._reader.read_next_list()[0]) else: data = self._reader.read_next_var_list() diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 9234577b8cc23a6bd2ed8986dfdcce0d21eeb3b3..8c2ff140ea4d5531a0ab6e284b1661573d9a2670 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -721,10 +721,9 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): value = value.astype(dtype) if _in_eager_mode(): - return core.eager.EagerTensor(value, - framework._current_expected_place(), - False, zero_copy, name - if name else None, True) + return core.eager.Tensor(value, + framework._current_expected_place(), False, + zero_copy, name if name else None, True) else: py_var = core.VarBase( value=value, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index f8800f3037b408b4ad6a8b33beb1282cff185f5e..dc1095849a3d8fa5de689a518934e4dea8dff99f 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -707,6 +707,8 @@ class ProgramCache(object): def __init__(self): # {hash_id : (concrete_program, partial_layer)} self._caches = collections.OrderedDict() + # trace mostly recent used program + self._recent_key = None def _build_once(self, cache_key): concrete_program = ConcreteProgram.from_func_spec( @@ -722,6 +724,7 @@ class ProgramCache(object): raise ValueError('type(item) should be CacheKey, but received %s' % type_name(item)) item_id = hash(item) + self._recent_key = item_id if item_id not in self._caches: self._caches[item_id] = self._build_once(item) # Note: raise warnings if number of traced program is more than `max_tracing_count` @@ -749,8 +752,8 @@ class ProgramCache(object): def last(self): assert len( self._caches) >= 1, "No valid cached program in ProgramCache." 
- key = next(reversed(self._caches.keys())) - return key, self._caches[key] + assert self._recent_key is not None + return self._recent_key, self._caches[self._recent_key] def __len__(self): return len(self._caches) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py index 368a01de81efc5dbfc2561f8a0023e0774e12f69..98e76c0f46ffc53abd84f8682b21e0c7ae204e8e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py @@ -349,7 +349,11 @@ class StaticAnalysisVisitor(object): ret_type = {NodeVarType.type_from_annotation(node.annotation)} # if annotation and value(Constant) are diffent type, we use value type if node.value: - ret_type = self.node_to_wrapper_map[node.value].node_var_type + node_value_type = self.node_to_wrapper_map[ + node.value].node_var_type + if not (node_value_type & + {NodeVarType.UNKNOWN, NodeVarType.STATEMENT}): + ret_type = node_value_type if isinstance(node.target, gast.Name): self.node_to_wrapper_map[node.target].node_var_type = ret_type self.var_env.set_var_type(node.target.id, ret_type) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 6a65b3bd9c6844c18ea49fd85ef61610cec1f7c2..53dbf1a66b27f35a75b44a0b6444cd8282c5278c 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -342,7 +342,7 @@ class Layer(object): import paddle import numpy as np - # the forward_post_hook change the input of the layer: input = input * 2 + # the forward_pre_hook change the input of the layer: input = input * 2 def forward_pre_hook(layer, input): # user can use layer and input for information statistis tasks diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 64c418fabb11f6a82ca328aa74ac540480477fba..d93791a1f083a56f2f9f7b8d1c09e675c490e9e8 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -222,7 +222,7 @@ def monkey_patch_math_varbase(): # 2. 
create varbase for scalar lhs_dtype = self.dtype if _in_eager_mode(): - other_var_should_be = core.eager.EagerTensor + other_var_should_be = core.eager.Tensor else: other_var_should_be = core.VarBase if not isinstance(other_var, other_var_should_be): @@ -343,7 +343,7 @@ def monkey_patch_math_varbase(): if core._in_eager_mode(): local_already_patch = _already_patch_eager_tensor _already_patch_eager_tensor = True - local_tensor = core.eager.EagerTensor + local_tensor = core.eager.Tensor else: local_already_patch = _already_patch_varbase _already_patch_varbase = True diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 418b80c6ee81620ac0beb94839f869e3334626f5..e1857a34f03f514e04e83e9596c9826569e2a90d 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -2233,6 +2233,19 @@ class NCE(layers.Layer): self._inputs['Weight'] = self.weight def forward(self, input, label, sample_weight=None): + if in_dygraph_mode(): + attrs = ('num_total_classes', self._attrs['num_total_classes'], + 'num_neg_samples', self._attrs['num_neg_samples'], 'seed', + self._attrs['seed'], 'sampler', self._attrs['sampler'], + 'is_sparse', self._attrs['is_sparse'], 'remote_prefetch', + self._attrs['remote_prefetch']) + cost, _, _ = _C_ops.nce( + input, label, self.weight, self.bias, + self._inputs['SampleWeight'], self._inputs['CustomDistProbs'], + self._inputs['CustomDistAlias'], + self._inputs['CustomDistAliasProbs'], *attrs) + return cost / (self._num_neg_samples + 1) + check_variable_and_dtype(input, "input", ['float32', 'float64'], "NCE") check_variable_and_dtype(label, "label", ['int64'], "NCE") check_type(sample_weight, 'sample_weight', (Variable, type(None)), diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index f5d569828775e6bcc90ffecb3d820696bf0e56c0..6f0305f4774d6429951ee69a5b3a9db1bed18131 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -150,7 +150,7 @@ def monkey_patch_varbase(): """ if core._in_eager_mode(): - base_tensor = core.eager.EagerTensor + base_tensor = core.eager.Tensor else: base_tensor = core.VarBase assert isinstance(value, (np.ndarray, base_tensor, dict, str)), \ @@ -180,9 +180,9 @@ def monkey_patch_varbase(): "Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( self.name, self_tensor_np.dtype, value_np.dtype) - # NOTE(wuweilong): self could be VarBase or EagerTensor, the subsequent behavior are defined in different files + # NOTE(wuweilong): self could be VarBase or Tensor, the subsequent behavior are defined in different files # if self is VarBase, method value() return Variable that bindded in imperative.cc, get_tensor() bindded in pybind.cc - # if self is EagerTensor, method value() return self that defined in this file, get_tensor() defined in eager_method.cc + # if self is Tensor, method value() return self that defined in this file, get_tensor() defined in eager_method.cc # this Interface behavior will be unifed in the future. self.value().get_tensor().set(value_np, framework._current_expected_place()) @@ -244,8 +244,8 @@ def monkey_patch_varbase(): if grad_tensor is not None: if core._in_eager_mode(): assert isinstance( - grad_tensor, core.eager.EagerTensor - ), "The type of grad_tensor must be paddle.Tensor" + grad_tensor, core.eager. 
+ Tensor), "The type of grad_tensor must be paddle.Tensor" else: assert isinstance( grad_tensor, paddle. @@ -592,8 +592,8 @@ def monkey_patch_varbase(): # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) """ if core._in_eager_mode(): - from paddle.tensor.to_string import eager_tensor_to_string - return eager_tensor_to_string(self) + from paddle.tensor.to_string import tensor_to_string + return tensor_to_string(self) else: from paddle.tensor.to_string import to_string return to_string(self) @@ -624,7 +624,7 @@ def monkey_patch_varbase(): "Only Leaf Tensor support the deepcopy at the moment, non-Leaf Tensors contains graph information that does't support deepcopy" ) if core._in_eager_mode(): - new_varbase = core.eager.EagerTensor() + new_varbase = core.eager.Tensor() else: new_varbase = core.VarBase() new_varbase.name = self.name + unique_name.generate("_deepcopy") @@ -808,16 +808,16 @@ def monkey_patch_varbase(): ("__getitem__", __getitem__), ("item", item), ("__setitem__", __setitem__), ("_to", _to)): if core._in_eager_mode(): - setattr(core.eager.EagerTensor, method_name, method) + setattr(core.eager.Tensor, method_name, method) else: setattr(core.VarBase, method_name, method) if core._in_eager_mode(): - setattr(core.eager.EagerTensor, "_grad_ivar", _grad_ivar) - setattr(core.eager.EagerTensor, "_set_grad_ivar", _set_grad_ivar) - setattr(core.eager.EagerTensor, "clear_gradient", clear_gradient) - setattr(core.eager.EagerTensor, "clone", clone) - setattr(core.eager.EagerTensor, "value", value) + setattr(core.eager.Tensor, "_grad_ivar", _grad_ivar) + setattr(core.eager.Tensor, "_set_grad_ivar", _set_grad_ivar) + setattr(core.eager.Tensor, "clear_gradient", clear_gradient) + setattr(core.eager.Tensor, "clone", clone) + setattr(core.eager.Tensor, "value", value) else: setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "grad", grad) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bb77f6031f7f99f85925cc805ee9b8ae57fc17df..780b8acc4fde67f4b47589869b258dd99a022125 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1057,7 +1057,7 @@ def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR, dtype = convert_np_dtype_to_dtype_(dtype) if _in_eager_mode(): - eager_tensor = core.eager.EagerTensor( + eager_tensor = core.eager.Tensor( dtype if dtype else core.VarDesc.VarType.FP32, list(shape) if shape else [], name, type if type else core.VarDesc.VarType.LOD_TENSOR, True @@ -1076,7 +1076,7 @@ class VariableMetaClass(type): t = type(instance) if in_dygraph_mode(): if _in_eager_mode(): - return issubclass(t, core.eager.EagerTensor) + return issubclass(t, core.eager.Tensor) return issubclass(t, core.VarBase) else: return issubclass(t, Variable) @@ -6412,7 +6412,7 @@ class ParamBase(core.VarBase): if hasattr(core, "eager"): - _core_eager_eagertensor = core.eager.EagerTensor + _core_eager_eagertensor = core.eager.Tensor else: _core_eager_eagertensor = object @@ -6918,7 +6918,7 @@ def _get_paddle_place(place): return place if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace, - core.IPUPlace, core.MLUPlace)): + core.IPUPlace, core.MLUPlace, core.CustomPlace)): return place if not isinstance(place, str): diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 67fcd901dedc964eedad2e1720a44cfa01037574..9f54a3547d39547e3d5540981d05d862573ea214 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ 
b/python/paddle/fluid/layer_helper_base.py @@ -85,10 +85,9 @@ class LayerHelperBase(object): assert in_dygraph_mode( ), "to_variable could only be called in dygraph mode" if _in_eager_mode(): - return core.eager.EagerTensor(value, - _current_expected_place(), False, - False, name - if name else None, True) + return core.eager.Tensor(value, + _current_expected_place(), False, + False, name if name else None, True) else: py_var = core.VarBase( value=value, diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index dde39b2dfdb6866df3bd92bba5f0c223c0a1a243..727ceca72d1f1cfc0c34dae4e516568052136ba4 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -972,7 +972,7 @@ class DygraphGeneratorLoader(DataLoaderBase): def __next__(self): try: if _in_eager_mode(): - return core.eager.read_next_eager_tensor_list( + return core.eager.read_next_tensor_list( self._reader.read_next_list()[0]) else: return self._reader.read_next_var_list() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py index f28e99fc00d97ae13689be208bd3b10727f053ef..b186869ee9747fdc2b5c51ecc5051ab6f93f3706 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py @@ -28,7 +28,7 @@ from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer class TestPsTrainerPass(PsPassTestBase): def init(self): self.config = {} - self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" + self.config['ps_mode_config'] = "" self.config['worker_num'] = "1" self.config['server_num'] = "1" self.config['run_minimize'] = "0" @@ -47,23 +47,58 @@ class TestPsTrainerPass(PsPassTestBase): def check(self): pass - def test_ps_optimizer_minimize_cpu(self): + def test_ps_optimizer_minimize_cpu_async(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" + self.config['run_minimize'] = '1' + + self.config['debug_new_minimize'] = '0' + self.config['log_dir'] = "/async_cpu_log_old_minimize" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch(self.config) + + self.config['debug_new_minimize'] = '1' + self.config['log_dir'] = "/async_cpu_log_new_minimize" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch(self.config) + + self.check() + + def test_ps_optimizer_minimize_cpu_sync(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_sync_ps_config.yaml" + self.config['run_minimize'] = '1' + + self.config['debug_new_minimize'] = '0' + self.config['log_dir'] = "/sync_cpu_log_old_minimize" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch(self.config) + + self.config['debug_new_minimize'] = '1' + self.config['log_dir'] = "/sync_cpu_log_new_minimize" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch(self.config) + + self.check() + + def test_ps_optimizer_minimize_cpu_geo(self): self.init() + self.config['ps_mode_config'] = "../ps/cpu_geo_ps_config.yaml" self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/cpu_log_old_minimize" + self.config['log_dir'] = "/geo_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) self.ps_launch(self.config) self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/cpu_log_new_minimize" + self.config['log_dir'] = "/geo_cpu_log_new_minimize" 
remove_path_if_exists(self.config['log_dir']) self.ps_launch(self.config) self.check() - # heter ps three-stage mode, still to be tested + # heter ps two-stage mode def test_ps_optimizer_minimize_heter(self): self.init() self.config['worker_num'] = "2" diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index d18c691325094e10dc181ad7778a6ba1ab81a57f..67091f5fabb2ede1b589ba863c86b86607514dbb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -214,6 +214,7 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): self.assertTrue(np.allclose(x_data + y_data, out_1.numpy())) self.assertTrue(len(foo.program_cache) == 1) self.assertTrue(len(foo.program_cache.concrete_programs()) == 1) + first_program = foo.program_cache.last() # [16, 10] + [10] (numpy) out_2 = foo(to_variable(x_data), y_data) @@ -232,6 +233,11 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): # create a new program self.assertTrue(len(foo.program_cache) == 2) + # test the most recently used program + foo(to_variable(x_data), y_data) + recent_program = foo.program_cache.last() + self.assertTrue(first_program == recent_program) + def test_get_concrete_program(self): foo = declarative(foo_func) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index 57386bd00c9f39a9c00c6f24b79cc226bf6e27dd..567f266cd57b1eb4d16602b9bf7e1ee95d56bf19 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -306,5 +306,35 @@ class TestListInForLoopWithSubscript(TestListWithoutControlFlow): self.input = np.random.random((3, 4)).astype('float32') +class ListWithCondNet(paddle.nn.Layer): + def __init__(self): + super(ListWithCondNet, self).__init__() + + @paddle.jit.to_static + def forward(self, x, index): + y = paddle.nn.functional.relu(x) + a = [] + + for i in y: + a.append(i) + + if index > 0: + res = a[0] * a[0] + else: + res = a[-1] * a[-1] + + z = a[-1] * res + return z + + +class TestListWithCondGradInferVarType(unittest.TestCase): + def test_to_static(self): + net = ListWithCondNet() + x = paddle.to_tensor([2, 3, 4], dtype='float32') + index = paddle.to_tensor([1]) + res = net(x, index) + self.assertEqual(res[0], 16.)
+ + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py index eb545e5ca26add0be3f61a6025833ddc8b376012..388291a51c22f4fa52fda5b99f30fb879df93447 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py @@ -147,6 +147,7 @@ result_var_type6 = { def func_to_test7(a: int, b: float, c: paddle.Tensor, d: float='diff'): a = True e, f = paddle.shape(c) + g: paddle.Tensor = len(c) result_var_type7 = { @@ -155,7 +156,8 @@ result_var_type7 = { 'c': {NodeVarType.TENSOR}, 'd': {NodeVarType.STRING}, 'e': {NodeVarType.PADDLE_RETURN_TYPES}, - 'f': {NodeVarType.PADDLE_RETURN_TYPES} + 'f': {NodeVarType.PADDLE_RETURN_TYPES}, + 'g': {NodeVarType.TENSOR} } test_funcs = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py index 838678b1c8449b6136dda00dcb3a70c03b3e9c16..e79b33d88d3f18a180b0e376131ce62e56726e4a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py @@ -98,10 +98,20 @@ class TrtConvertGeluTest(TrtLayerAutoScanTest): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['approximate'] == True or self.dims == 1: + valid_version = (7, 0, 0) + compile_version = paddle_infer.get_trt_compile_version() + runtime_version = paddle_infer.get_trt_runtime_version() + self.assertTrue(compile_version == runtime_version) + # Dimension one only runs on Paddle OP + if self.dims == 1: return 0, 3 - else: + if compile_version >= valid_version: return 1, 2 + else: + if attrs[0]['approximate'] == True: + return 0, 3 + else: + return 1, 2 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py index ddb96c37db780c214d40e8afca7348cff935ce6c..89ce1145d74e01c32e155495fcb4212bed78ab84 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -52,7 +52,7 @@ class TrtConvertPool2dTest(TrtLayerAutoScanTest): return np.random.random([24, 3, 3, 3]).astype(np.float32) for strides in [[1, 1], [1, 2], [2, 2]]: - for paddings in [[0, 2], [0, 3], [0, 1, 2, 3]]: + for paddings in [[0, 2], [0, 3]]: for pooling_type in ['max', 'avg']: for padding_algotithm in ['EXPLICIT', 'SAME', 'VAILD']: for ksize in [[2, 3], [3, 3]]: @@ -145,44 +145,18 @@ class TrtConvertPool2dTest(TrtLayerAutoScanTest): True), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len(program_config.ops[0].attrs['paddings']) == 4: - return True - return False - - self.add_skip_case(teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "4-dims paddings are not support for trt now.") - - def teller2(program_config, predictor_config): - if program_config.ops[0].attrs['global_pooling'] == True: - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_IMPLEMENTED, - "It is not support that global_pooling is true for trt now.") - - def teller3(program_config, predictor_config): - if self.dynamic_shape.min_input_shape 
== {} and program_config.ops[ - 0].attrs['ceil_mode'] == True: - return True - return False - - self.add_skip_case( - teller3, SkipReasons.TRT_NOT_IMPLEMENTED, - "It is not support that ceil_mode is true in static mode for trt now." - ) - - def teller4(program_config, predictor_config): - if self.dynamic_shape.min_input_shape != {} and ( - program_config.ops[0].attrs['strides'] == [1, 2] or - program_config.ops[0].attrs['strides'] == [2, 2]): + def teller(program_config, predictor_config): + if program_config.ops[0].attrs['pooling_type'] == 'avg' and \ + program_config.ops[0].attrs['global_pooling'] == False and \ + program_config.ops[0].attrs['exclusive'] == True and \ + program_config.ops[0].attrs['adaptive'] == False and \ + program_config.ops[0].attrs['ceil_mode'] == True: return True return False self.add_skip_case( - teller4, SkipReasons.TRT_NOT_IMPLEMENTED, - "It is not support that strides is not equal [1, 1] in dynamic mode for trt now." + teller, SkipReasons.TRT_NOT_IMPLEMENTED, + "The results of some cases are Nan, but the results of TensorRT and GPU are the same." ) def test(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py index 85bd625413c86d995cdfb515c49c3a87af237a6c..1bcbbc38c9762cb19b9b8b01ac8e1728b11d38e0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py @@ -28,25 +28,14 @@ class TRTGroupNormTest(InferencePassTest): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( name="data", shape=[-1, 512, 12, 12], dtype="float32") - relu_out = fluid.layers.relu(data) - relu6_out = fluid.layers.relu6(relu_out) - tanh_out = fluid.layers.tanh(relu6_out) - conv_out = fluid.layers.conv2d( - input=tanh_out, - num_filters=512, - filter_size=3, - groups=1, - padding=[1, 1], - bias_attr=False, - act=None) - out = self.append_group_norm(conv_out) + out = self.append_group_norm(data) self.feeds = { "data": np.random.random([1, 512, 12, 12]).astype("float32"), } self.enable_trt = True self.trt_parameters = TRTGroupNormTest.TensorRTParam( - 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + 1 << 30, 1, 1, AnalysisConfig.Precision.Float32, False, False) self.dynamic_shape_params = TRTGroupNormTest.DynamicShapeParam({ 'data': [1, 512, 12, 12] }, {'data': [1, 512, 12, 12]}, {'data': [1, 512, 12, 12]}, False) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py index 26ad45db7a18d6b51150ebe988bd47473fd01c40..d71937f986e515bfffb0713fe11e8547a648628d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py @@ -119,6 +119,17 @@ class TensorRTAvgPoolTest(TensorRTPoolTest): self.exclusive = False +class TensorRTAvgCeilPoolTest(TensorRTPoolTest): + def set_extra_config(self): + self.pool_size = 2 + self.pool_type = 'avg' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = True + self.exclusive = False + + class TensorRTGlobalPoolTest(TensorRTPoolTest): def set_extra_config(self): self.pool_size = 2 diff --git a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py index 
2d9703117671cb4d994923695a6944061bf99838..fd442c6205e98d26b4797ff2ef4499b376bc8bdd 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py @@ -25,7 +25,125 @@ from paddle.fluid import Program, program_guard import sys sys.path.append('..') from op_test import OpTest -from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive +from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive, adaptive_start_index, adaptive_end_index + + +def pool2d_backward_navie(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False, + data_format='NCHW', + pool_type="max", + padding_algorithm="EXPLICIT"): + # update paddings + def _get_padding_with_SAME(input_shape, pool_size, pool_stride): + padding = [] + for input_size, filter_size, stride_size in zip(input_shape, pool_size, + pool_stride): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max(( + (out_size - 1) * stride_size + filter_size - input_size, 0)) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + if isinstance(padding_algorithm, str): + padding_algorithm = padding_algorithm.upper() + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError("Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." % + str(padding_algorithm)) + + if padding_algorithm == "VALID": + paddings = [0, 0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)" + " must be False. " + "Received ceil_mode: True.") + elif padding_algorithm == "SAME": + input_data_shape = [] + if data_format == "NCHW": + input_data_shape = x.shape[2:4] + elif data_format == "NHWC": + input_data_shape = x.shape[1:3] + paddings = _get_padding_with_SAME(input_data_shape, ksize, strides) + + assert len(paddings) == 2 or len(paddings) == 4 + is_sys = True if len(paddings) == 2 else False + + if data_format == "NHWC": + x = x.transpose([0, 3, 1, 2]) + + N, C, H, W = x.shape + + if global_pool == 1: + ksize = [H, W] + paddings = [0 for _ in range(len(paddings))] + + pad_h_up = paddings[0] if is_sys else paddings[0] + pad_h_down = paddings[0] if is_sys else paddings[1] + pad_w_left = paddings[1] if is_sys else paddings[2] + pad_w_right = paddings[1] if is_sys else paddings[3] + + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + pad_h_up + pad_h_down + strides[0] - 1) // strides[0] + 1 \ + if ceil_mode else (H - ksize[0] + pad_h_up + pad_h_down) // strides[0] + 1 + W_out = (W - ksize[1] + pad_w_left + pad_w_right + strides[1] - 1) // strides[1] + 1 \ + if ceil_mode else (W - ksize[1] + pad_w_left + pad_w_right) // strides[1] + 1 + + x_grad = np.zeros_like(x) + for i in range(H_out): + if adaptive: + in_h_start = adaptive_start_index(i, H, ksize[0]) + in_h_end = adaptive_end_index(i, H, ksize[0]) + else: + in_h_start = np.max((i * strides[0] - pad_h_up, 0)) + in_h_end = np.min((i * strides[0] + ksize[0] - pad_h_up, H)) + + for j in range(W_out): + if adaptive: + in_w_start = adaptive_start_index(j, W, ksize[1]) + in_w_end = adaptive_end_index(j, W, ksize[1]) + else: + in_h_start = i * strides[0] - pad_h_up + in_w_start = j * strides[1] - pad_w_left + in_h_end = i * strides[0] + ksize[0] - pad_h_up + in_w_end = j * strides[1] + ksize[1] - pad_w_left + + field_size = 
(in_h_end - in_h_start) * (in_w_end - in_w_start) + in_h_start = np.max((in_h_start, 0)) + in_w_start = np.max((in_w_start, 0)) + in_h_end = np.min((in_h_end, H)) + in_w_end = np.min((in_w_end, W)) + + if pool_type == 'avg': + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) + x_grad[:, :, in_h_start:in_h_end, in_w_start: + in_w_end] += 1 / field_size + elif pool_type == 'max': + for n in range(N): + for c in range(C): + idx = np.argmax(x[n, c, in_h_start:in_h_end, in_w_start: + in_w_end].flatten()) + idx_h = idx // (in_w_end - in_w_start) + idx_w = idx % (in_w_end - in_w_start) + x_grad[n, c, in_h_start + idx_h, in_w_start + + idx_w] += 1 + + if data_format == "NHWC": + x_grad = x_grad.transpose([0, 2, 3, 1]) + return x_grad class TestPool2D_Op_Mixin(object): @@ -71,12 +189,25 @@ class TestPool2D_Op_Mixin(object): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return - - if self.pool_type != "max": - self.check_grad_with_place( - self.place, set(['X']), 'Out', max_relative_error=0.07) + x_grad = pool2d_backward_navie( + self.inputs["X"], + ksize=self.ksize, + strides=self.strides, + paddings=self.paddings, + global_pool=self.global_pool, + ceil_mode=False, + exclusive=self.exclusive, + adaptive=self.adaptive, + data_format=self.data_format, + pool_type=self.pool_type, + padding_algorithm=self.padding_algorithm) + x_grad = x_grad / np.prod(self.outputs['Out'].shape) + self.check_grad_with_place( + self.place, + set(['X']), + 'Out', + max_relative_error=0.06, + user_defined_grads=[x_grad]) def init_data_format(self): self.data_format = "NCHW" @@ -108,7 +239,6 @@ class TestPool2D_Op_Mixin(object): def init_exclusive(self): self.exclusive = True - # Not support adaptive pooling currently def init_adaptive(self): self.adaptive = False @@ -173,7 +303,7 @@ class TestCase5(TestCase2): self.pool2D_forward_naive = max_pool2D_forward_naive -def create_test_fp16_class(parent, check_grad=True): +def create_test_fp16_class(parent): class TestFp16Case(parent): def init_data_type(self): self.dtype = np.float16 @@ -182,19 +312,13 @@ def create_test_fp16_class(parent, check_grad=True): place = core.MLUPlace(0) self.check_output_with_place(place, atol=1e-3) - def test_check_grad(self): - place = core.MLUPlace(0) - if self.pool_type != "max" and check_grad: - self.check_grad_with_place( - place, set(['X']), 'Out', max_relative_error=0.07) - cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") TestFp16Case.__name__ = cls_name globals()[cls_name] = TestFp16Case create_test_fp16_class(TestPool2D_Op) -create_test_fp16_class(TestCase1, check_grad=False) +create_test_fp16_class(TestCase1) create_test_fp16_class(TestCase2) create_test_fp16_class(TestCase3) create_test_fp16_class(TestCase4) @@ -222,6 +346,24 @@ class TestAvgInclude(TestCase2): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + +class TestAvgPoolAdaptiveAsyOutSize(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_shape(self): + self.shape = [8, 3, 6, 6] + + def init_test_case(self): + self.ksize = [2, 3] + self.strides = [1, 1] + self.paddings = [0, 0, 0, 0] + + #-------test pool2d with asymmetric padding----- @@ -302,6 +444,19 @@ class TestAvgInclude_AsyPadding(TestCase2): self.shape = [2, 3, 7, 7] +class TestAvgPoolAdaptive_AsyPadding(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_test_case(self): + self.ksize = [3, 3] + 
self.strides = [1, 1] + self.paddings = [1, 1, 0, 2] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + #----------- test channel_last -------------- class TestPool2D_channel_last(TestPool2D_Op): def init_data_format(self): @@ -359,14 +514,6 @@ class TestCase5_Max(TestCase2): def init_pool_type(self): self.pool_type = "max" - def test_check_grad(self): - if self.dtype == np.float16: - return - place = core.MLUPlace(0) - if self.pool_type == "max": - self.check_grad_with_place( - place, set(['X']), 'Out', max_relative_error=1.00) - class TestCase5_channel_last_Max(TestCase5_Max): def init_data_format(self): @@ -381,6 +528,11 @@ class TestAvgInclude_channel_last(TestCase2_channel_last): self.exclusive = False +class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last): + def init_adaptive(self): + self.adaptive = True + + class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding): def init_data_format(self): self.data_format = "NHWC" diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index 882043ef6eb911f6163d516e9929658f38810ade..23ca0cf1f492fade05a81f0de1d6bc262458675c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -71,7 +71,7 @@ class TestMatMulV2Op(OpTest): self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -83,7 +83,7 @@ class TestMatMuklOp2(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -95,7 +95,7 @@ class TestMatMuklOp3(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -107,7 +107,7 @@ class TestMatMuklOp4(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -119,7 +119,7 @@ class TestMatMuklOp5(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -131,7 +131,7 @@ class TestMatMuklOp6(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -143,7 +143,7 @@ class TestMatMuklOp7(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -155,7 +155,7 @@ class TestMatMuklOp8(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -167,7 +167,7 @@ class TestMatMuklOp9(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -179,7 +179,7 @@ class TestMatMuklOp10(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -191,7 +191,7 @@ class TestMatMuklOp11(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -203,7 +203,7 @@ class TestMatMuklOp12(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -215,7 +215,7 @@ class TestMatMuklOp13(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): 
+class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -227,7 +227,7 @@ class TestMatMuklOp14(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -239,7 +239,7 @@ class TestMatMuklOp15(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the gradient for special case """ @@ -251,7 +251,7 @@ class TestMatMuklOp16(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -263,7 +263,7 @@ class TestMatMuklOp17(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOpBroadcast1(TestMatMulV2Op): +class TestMatMulOpBroadcast1(TestMatMulV2Op): """ case 14_3 """ @@ -275,7 +275,7 @@ class TestMatMuklOpBroadcast1(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOpBroadcast2(TestMatMulV2Op): +class TestMatMulOpBroadcast2(TestMatMulV2Op): """ case 14_4 """ @@ -310,22 +310,22 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): create_test_fp16_class(TestMatMulV2Op) -create_test_fp16_class(TestMatMuklOp2) -create_test_fp16_class(TestMatMuklOp3) -create_test_fp16_class(TestMatMuklOp4) -create_test_fp16_class(TestMatMuklOp5) -create_test_fp16_class(TestMatMuklOp6) -create_test_fp16_class(TestMatMuklOp7) -create_test_fp16_class(TestMatMuklOp8) -create_test_fp16_class(TestMatMuklOp9) -create_test_fp16_class(TestMatMuklOp10) -create_test_fp16_class(TestMatMuklOp11) -create_test_fp16_class(TestMatMuklOp12) -create_test_fp16_class(TestMatMuklOp13) -create_test_fp16_class(TestMatMuklOp14) -create_test_fp16_class(TestMatMuklOp15) -create_test_fp16_class(TestMatMuklOp16) -create_test_fp16_class(TestMatMuklOp17) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp7) +create_test_fp16_class(TestMatMulOp8) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) +create_test_fp16_class(TestMatMulOp14) +create_test_fp16_class(TestMatMulOp15) +create_test_fp16_class(TestMatMulOp16) +create_test_fp16_class(TestMatMulOp17) class TestMatMulV2API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 754d7bd54b9f817d73c2f5d705026c9a468f4008..85423df3d382831738c2c64ea845d0661f9cdbb7 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1658,7 +1658,7 @@ class OpTest(unittest.TestCase): for grad in analytic_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) - max_relative_error = 0.03 if max_relative_error < 0.03 else max_relative_error + max_relative_error = 0.04 if max_relative_error < 0.04 else max_relative_error fp32_analytic_grads.append(grad) analytic_grads = fp32_analytic_grads @@ -1666,7 +1666,7 @@ class OpTest(unittest.TestCase): for grad in numeric_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) - max_relative_error = 0.03 if max_relative_error < 0.03 else max_relative_error + max_relative_error = 0.04 if max_relative_error < 0.04 else max_relative_error fp32_numeric_grads.append(grad) 
numeric_grads = fp32_numeric_grads diff --git a/python/paddle/fluid/tests/unittests/ps/cpu_async_ps_config.yaml b/python/paddle/fluid/tests/unittests/ps/cpu_async_ps_config.yaml index 669709ea5607e5def58204637202e839090f1197..93a13a67ce6b519fa3c1b544f05ff59072a9f77d 100755 --- a/python/paddle/fluid/tests/unittests/ps/cpu_async_ps_config.yaml +++ b/python/paddle/fluid/tests/unittests/ps/cpu_async_ps_config.yaml @@ -26,7 +26,6 @@ hyper_parameters: fc_sizes: [400, 400, 400] runner: - geo_step: 400 sync_mode: "async" # sync / async / geo / heter thread_num: 16 use_gpu: 0 diff --git a/paddle/scripts/get_pten_kernel_function.sh b/python/paddle/fluid/tests/unittests/ps/cpu_geo_ps_config.yaml old mode 100755 new mode 100644 similarity index 51% rename from paddle/scripts/get_pten_kernel_function.sh rename to python/paddle/fluid/tests/unittests/ps/cpu_geo_ps_config.yaml index 6ae2f1b679e3eafcef5c20376ecd82784d61d4e0..80125ae6c37faa469469f5f67bd9b8796fd079f2 --- a/paddle/scripts/get_pten_kernel_function.sh +++ b/python/paddle/fluid/tests/unittests/ps/cpu_geo_ps_config.yaml @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,15 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -#================================================= -# Utils -#================================================= - -set -e +# refer to PaddleRec/models/rank/dnn/benchmark.yaml -EXIT_CODE=0; -tmp_dir=`mktemp -d` +hyper_parameters: + optimizer: + class: Adam + learning_rate: 0.0001 + adam_lazy_mode: True + sparse_inputs_slots: 27 + sparse_feature_number: 1000001 + sparse_feature_dim: 10 + dense_input_dim: 13 + fc_sizes: [400, 400, 400] -PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" +runner: + geo_step: 400 + sync_mode: "geo" + thread_num: 16 + use_gpu: 0 + + model_path: "../ps_dnn_model.py" -unset GREP_OPTIONS && find ${PADDLE_ROOT}/paddle/pten/kernels -name "*.c*" | xargs sed -e '/PT_REGISTER_\(GENERAL_\)\?KERNEL(/,/)/!d' | awk 'BEGIN { RS="{" }{ gsub(/\n /,""); print $0 }' | grep PT_REGISTER | awk -F ",|\(" '{gsub(/ /,"");print $2, $3, $4, $5}' | sort -u | awk '{gsub(/pten::/,"");print $0}' | grep -v "_grad" + diff --git a/python/paddle/fluid/tests/unittests/ps/cpu_sync_ps_config.yaml b/python/paddle/fluid/tests/unittests/ps/cpu_sync_ps_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95685a488cade1219290956dab8339ee641e001f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ps/cpu_sync_ps_config.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# refer to PaddleRec/models/rank/dnn/benchmark.yaml + +hyper_parameters: + optimizer: + class: Adam + learning_rate: 0.0001 + adam_lazy_mode: True + sparse_inputs_slots: 27 + sparse_feature_number: 1000001 + sparse_feature_dim: 10 + dense_input_dim: 13 + fc_sizes: [400, 400, 400] + +runner: + sync_mode: "sync" + thread_num: 16 + use_gpu: 0 + + model_path: "../ps_dnn_model.py" + + diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index 8f8ff65af544a1c4ddb4f1548603b418d3bf8bed..d08c1d41c89ec532f6c3124000f9bec38f9b86d7 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -329,9 +329,9 @@ class DnnTrainer(object): sync_mode = self.config.get("runner.sync_mode") inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True) + self.role_maker._generate_role() # required if self.config['debug_new_minimize'] == 1: logger.info("entering run_minimize -- new") - self.role_maker._generate_role() # required from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer ps_optimizer = ParameterServerOptimizer(inner_optimizer) ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, @@ -346,11 +346,16 @@ class DnnTrainer(object): if fleet.is_server(): _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_server_main.prototxt' - debug_program(_main_file, loss.block.program, 0) + debug_program(_main_file, loss.block.program) elif fleet.is_worker(): _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_worker_main.prototxt' - debug_program(_main_file, loss.block.program, 1) + debug_program(_main_file, loss.block.program) + elif self.role_maker._is_heter_worker(): + _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + self.config[ + 'debug_new_minimize']) + '_heter_worker_main.prototxt' + debug_program(_main_file, loss.block.program) def run_single_pass(self): self.init_fleet_with_gloo() @@ -395,17 +400,18 @@ class DnnTrainer(object): _main_file = '/' + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_server_main.prototxt' - debug_program(_main_file, _main, 0) + debug_program(_main_file, _main) elif fleet.is_worker(): _main_file = '/' + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_worker_main.prototxt' - debug_program(_main_file, _main, 1) + debug_program(_main_file, _main) if __name__ == "__main__": paddle.enable_static() config = parse_args() + logger.info(">>>>>>>>>> python process started") os.environ["CPU_NUM"] = str(config.get("runner.thread_num")) benchmark_main = DnnTrainer(config) if config['run_single_pass'] == 1: diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index d6bf768bee7744524d33082b2cda81ea4870e534..252482fa6d270edbc1bec3a0d6023933521d7f7e 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -109,26 +109,26 @@ class EagerDtypeTestCase(unittest.TestCase): core.VarDesc.VarType.COMPLEX128) -class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): +class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): def constructor(self, place): - egr_tensor = core.eager.EagerTensor()
+ egr_tensor = core.eager.Tensor() self.assertEqual(egr_tensor.persistable, False) self.assertTrue("generated" in egr_tensor.name) self.assertEqual(egr_tensor.shape, []) self.assertEqual(egr_tensor.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor.stop_gradient, True) - egr_tensor0 = core.eager.EagerTensor( - core.VarDesc.VarType.FP32, [4, 16, 16, 32], "test_eager_tensor", - core.VarDesc.VarType.LOD_TENSOR, True) + egr_tensor0 = core.eager.Tensor(core.VarDesc.VarType.FP32, + [4, 16, 16, 32], "test_eager_tensor", + core.VarDesc.VarType.LOD_TENSOR, True) self.assertEqual(egr_tensor0.persistable, True) self.assertEqual(egr_tensor0.name, "test_eager_tensor") self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP32) arr0 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor1 = core.eager.EagerTensor(arr0, place, True, False, - "numpy_tensor1", False) + egr_tensor1 = core.eager.Tensor(arr0, place, True, False, + "numpy_tensor1", False) self.assertEqual(egr_tensor1.persistable, True) self.assertEqual(egr_tensor1.name, "numpy_tensor1") self.assertEqual(egr_tensor1.shape, [4, 16, 16, 32]) @@ -138,8 +138,8 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(np.array_equal(egr_tensor1.numpy(), arr0)) arr1 = np.random.randint(100, size=(4, 16, 16, 32), dtype=np.int64) - egr_tensor2 = core.eager.EagerTensor(arr1, place, False, True, - "numpy_tensor2", True) + egr_tensor2 = core.eager.Tensor(arr1, place, False, True, + "numpy_tensor2", True) self.assertEqual(egr_tensor2.persistable, False) self.assertEqual(egr_tensor2.name, "numpy_tensor2") self.assertEqual(egr_tensor2.shape, [4, 16, 16, 32]) @@ -149,7 +149,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(np.array_equal(egr_tensor2.numpy(), arr1)) arr2 = np.random.rand(4, 16, 16, 32, 64).astype('float32') - egr_tensor3 = core.eager.EagerTensor(arr2) + egr_tensor3 = core.eager.Tensor(arr2) self.assertEqual(egr_tensor3.persistable, False) self.assertTrue("generated_tensor" in egr_tensor3.name) self.assertEqual(egr_tensor3.shape, [4, 16, 16, 32, 64]) @@ -161,7 +161,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(np.array_equal(egr_tensor3.numpy(), arr2)) egr_tensor3.stop_gradient = False - egr_tensor4 = core.eager.EagerTensor(egr_tensor3) + egr_tensor4 = core.eager.Tensor(egr_tensor3) self.assertEqual(egr_tensor4.persistable, False) self.assertTrue("generated_tensor" in egr_tensor4.name) self.assertEqual(egr_tensor4.shape, egr_tensor3.shape) @@ -174,7 +174,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): np.array_equal(egr_tensor4.numpy(), egr_tensor3.numpy())) arr4 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor5 = core.eager.EagerTensor(arr4, place) + egr_tensor5 = core.eager.Tensor(arr4, place) self.assertEqual(egr_tensor5.persistable, False) self.assertTrue("generated_tensor" in egr_tensor5.name) self.assertEqual(egr_tensor5.shape, [4, 16, 16, 32]) @@ -183,7 +183,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor5.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor5.numpy(), arr4)) - egr_tensor6 = core.eager.EagerTensor(egr_tensor5, core.CPUPlace()) + egr_tensor6 = core.eager.Tensor(egr_tensor5, core.CPUPlace()) self.assertEqual(egr_tensor6.persistable, False) self.assertTrue("generated_tensor" in egr_tensor6.name) self.assertEqual(egr_tensor6.shape, [4, 16, 16, 32]) @@ 
-193,7 +193,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor6.numpy(), egr_tensor5.numpy())) - egr_tensor7 = core.eager.EagerTensor(arr4, place, True) + egr_tensor7 = core.eager.Tensor(arr4, place, True) self.assertEqual(egr_tensor7.persistable, True) self.assertTrue("generated_tensor" in egr_tensor7.name) self.assertEqual(egr_tensor7.shape, [4, 16, 16, 32]) @@ -202,7 +202,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor7.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor7.numpy(), arr4)) - egr_tensor8 = core.eager.EagerTensor(egr_tensor6, place, "egr_tensor8") + egr_tensor8 = core.eager.Tensor(egr_tensor6, place, "egr_tensor8") self.assertEqual(egr_tensor8.persistable, False) self.assertEqual(egr_tensor8.name, "egr_tensor8") self.assertEqual(egr_tensor8.shape, [4, 16, 16, 32]) @@ -212,7 +212,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor8.numpy(), egr_tensor5.numpy())) - egr_tensor9 = core.eager.EagerTensor(arr4, place, True, True) + egr_tensor9 = core.eager.Tensor(arr4, place, True, True) self.assertEqual(egr_tensor9.persistable, True) self.assertTrue("generated_tensor" in egr_tensor9.name) self.assertEqual(egr_tensor9.shape, [4, 16, 16, 32]) @@ -224,7 +224,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): x = np.random.rand(3, 3).astype('float32') t = paddle.fluid.Tensor() t.set(x, paddle.fluid.CPUPlace()) - egr_tensor10 = core.eager.EagerTensor(t, place) + egr_tensor10 = core.eager.Tensor(t, place) self.assertEqual(egr_tensor10.persistable, False) self.assertTrue("generated_tensor" in egr_tensor10.name) self.assertEqual(egr_tensor10.shape, [3, 3]) @@ -233,7 +233,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor10.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor10.numpy(), x)) - egr_tensor11 = core.eager.EagerTensor(t, place, "framework_constructed") + egr_tensor11 = core.eager.Tensor(t, place, "framework_constructed") self.assertEqual(egr_tensor11.persistable, False) self.assertTrue("framework_constructed" in egr_tensor11.name) self.assertEqual(egr_tensor11.shape, [3, 3]) @@ -242,7 +242,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor11.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor11.numpy(), x)) - egr_tensor12 = core.eager.EagerTensor(t) + egr_tensor12 = core.eager.Tensor(t) self.assertEqual(egr_tensor12.persistable, False) self.assertTrue("generated_tensor" in egr_tensor12.name) self.assertEqual(egr_tensor12.shape, [3, 3]) @@ -290,10 +290,10 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.constructor(p) def constructor_with_kwargs(self, place): - # init EagerTensor by Python array + # init Tensor by Python array arr = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor0 = core.eager.EagerTensor(value=arr) + egr_tensor0 = core.eager.Tensor(value=arr) self.assertEqual(egr_tensor0.persistable, False) self.assertTrue("generated" in egr_tensor0.name) self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) @@ -303,7 +303,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor0.stop_gradient, True) - egr_tensor1 = core.eager.EagerTensor(value=arr, place=place) + egr_tensor1 = core.eager.Tensor(value=arr, place=place) 
self.assertEqual(egr_tensor1.persistable, False) self.assertTrue("generated" in egr_tensor1.name) self.assertEqual(egr_tensor1.shape, [4, 16, 16, 32]) @@ -311,7 +311,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor1.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor1.stop_gradient, True) - egr_tensor2 = core.eager.EagerTensor(arr, place=place) + egr_tensor2 = core.eager.Tensor(arr, place=place) self.assertEqual(egr_tensor2.persistable, False) self.assertTrue("generated" in egr_tensor2.name) self.assertEqual(egr_tensor2.shape, [4, 16, 16, 32]) @@ -319,7 +319,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor2.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor2.stop_gradient, True) - egr_tensor3 = core.eager.EagerTensor( + egr_tensor3 = core.eager.Tensor( arr, place=place, name="new_eager_tensor") self.assertEqual(egr_tensor3.persistable, False) self.assertTrue("new_eager_tensor" in egr_tensor3.name) @@ -328,7 +328,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor3.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor3.stop_gradient, True) - egr_tensor4 = core.eager.EagerTensor( + egr_tensor4 = core.eager.Tensor( arr, place=place, persistable=True, name="new_eager_tensor") self.assertEqual(egr_tensor4.persistable, True) self.assertTrue("new_eager_tensor" in egr_tensor4.name) @@ -337,7 +337,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor4.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor4.stop_gradient, True) - egr_tensor5 = core.eager.EagerTensor( + egr_tensor5 = core.eager.Tensor( arr, core.CPUPlace(), persistable=True, @@ -350,7 +350,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor5.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor5.stop_gradient, True) - egr_tensor6 = core.eager.EagerTensor( + egr_tensor6 = core.eager.Tensor( arr, place=core.CPUPlace(), persistable=True, @@ -363,7 +363,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor6.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor6.stop_gradient, True) - egr_tensor7 = core.eager.EagerTensor( + egr_tensor7 = core.eager.Tensor( arr, place=place, persistable=True, @@ -376,7 +376,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor7.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor7.stop_gradient, True) - egr_tensor8 = core.eager.EagerTensor( + egr_tensor8 = core.eager.Tensor( arr, place=place, persistable=True, @@ -390,7 +390,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor8.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor8.stop_gradient, False) - egr_tensor9 = core.eager.EagerTensor( + egr_tensor9 = core.eager.Tensor( arr, place, True, True, "new_eager_tensor", stop_gradient=False) self.assertEqual(egr_tensor9.persistable, True) self.assertTrue("new_eager_tensor" in egr_tensor9.name) @@ -399,7 +399,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor9.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor9.stop_gradient, False) - egr_tensor10 = core.eager.EagerTensor( + egr_tensor10 = core.eager.Tensor( arr, place, True, @@ -413,7 +413,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): 
self.assertEqual(egr_tensor10.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor10.stop_gradient, False) - egr_tensor11 = core.eager.EagerTensor( + egr_tensor11 = core.eager.Tensor( arr, place, True, @@ -427,7 +427,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor11.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor11.stop_gradient, False) - egr_tensor12 = core.eager.EagerTensor( + egr_tensor12 = core.eager.Tensor( arr, place, persistable=True, @@ -441,7 +441,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor12.stop_gradient, False) - egr_tensor13 = core.eager.EagerTensor( + egr_tensor13 = core.eager.Tensor( value=arr, place=place, persistable=True, @@ -456,7 +456,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor13.stop_gradient, False) # special case - egr_tensor14 = core.eager.EagerTensor( + egr_tensor14 = core.eager.Tensor( dtype=core.VarDesc.VarType.FP32, dims=[4, 16, 16, 32], name="special_eager_tensor", @@ -467,8 +467,8 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertEqual(egr_tensor14.shape, [4, 16, 16, 32]) self.assertEqual(egr_tensor14.dtype, core.VarDesc.VarType.FP32) - # init EagerTensor by EagerTensor - egr_tensor15 = core.eager.EagerTensor(value=egr_tensor4) + # init Tensor by Tensor + egr_tensor15 = core.eager.Tensor(value=egr_tensor4) self.assertEqual(egr_tensor15.persistable, True) self.assertTrue("generated" in egr_tensor15.name) self.assertEqual(egr_tensor15.shape, egr_tensor4.shape) @@ -480,7 +480,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor15.numpy(), egr_tensor4.numpy())) - egr_tensor16 = core.eager.EagerTensor( + egr_tensor16 = core.eager.Tensor( value=egr_tensor4, name="new_eager_tensor") self.assertEqual(egr_tensor16.persistable, True) self.assertTrue("new_eager_tensor" in egr_tensor16.name) @@ -493,7 +493,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor16.numpy(), egr_tensor4.numpy())) - egr_tensor17 = core.eager.EagerTensor( + egr_tensor17 = core.eager.Tensor( value=egr_tensor4, place=place, name="new_eager_tensor", ) @@ -506,7 +506,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor17.numpy(), egr_tensor4.numpy())) - egr_tensor18 = core.eager.EagerTensor( + egr_tensor18 = core.eager.Tensor( egr_tensor4, place=place, name="new_eager_tensor", ) @@ -519,7 +519,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue( np.array_equal(egr_tensor18.numpy(), egr_tensor4.numpy())) - egr_tensor19 = core.eager.EagerTensor( + egr_tensor19 = core.eager.Tensor( egr_tensor4, place, name="new_eager_tensor", ) @@ -536,7 +536,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): x = np.random.rand(3, 3).astype('float32') t = paddle.fluid.Tensor() t.set(x, paddle.fluid.CPUPlace()) - egr_tensor20 = core.eager.EagerTensor(value=t) + egr_tensor20 = core.eager.Tensor(value=t) self.assertEqual(egr_tensor20.persistable, False) self.assertTrue("generated_tensor" in egr_tensor20.name) self.assertEqual(egr_tensor20.shape, [3, 3]) @@ -547,7 +547,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): paddle.fluid.framework._current_expected_place())) 
self.assertTrue(np.array_equal(egr_tensor20.numpy(), x)) - egr_tensor21 = core.eager.EagerTensor(value=t, place=place) + egr_tensor21 = core.eager.Tensor(value=t, place=place) self.assertEqual(egr_tensor21.persistable, False) self.assertTrue("generated_tensor" in egr_tensor21.name) self.assertEqual(egr_tensor21.shape, [3, 3]) @@ -556,7 +556,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor21.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor21.numpy(), x)) - egr_tensor22 = core.eager.EagerTensor(t, place=place) + egr_tensor22 = core.eager.Tensor(t, place=place) self.assertEqual(egr_tensor22.persistable, False) self.assertTrue("generated_tensor" in egr_tensor22.name) self.assertEqual(egr_tensor22.shape, [3, 3]) @@ -565,8 +565,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor22.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor22.numpy(), x)) - egr_tensor23 = core.eager.EagerTensor( - t, place, name="from_framework_tensor") + egr_tensor23 = core.eager.Tensor(t, place, name="from_framework_tensor") self.assertEqual(egr_tensor23.persistable, False) self.assertTrue("from_framework_tensor" in egr_tensor23.name) self.assertEqual(egr_tensor23.shape, [3, 3]) @@ -575,7 +574,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor23.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor23.numpy(), x)) - egr_tensor24 = core.eager.EagerTensor( + egr_tensor24 = core.eager.Tensor( value=t, place=place, name="from_framework_tensor") self.assertEqual(egr_tensor24.persistable, False) self.assertTrue("from_framework_tensor" in egr_tensor24.name) @@ -587,7 +586,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): # Bad usage # SyntaxError: positional argument follows keyword argument - # egr_tensor25 = core.eager.EagerTensor(value=t, place) + # egr_tensor25 = core.eager.Tensor(value=t, place) def test_constructor_with_kwargs(self): print("Test_constructor_with_kwargs") @@ -655,7 +654,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): tensor2 = None tensor = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, core.CPUPlace()) - tensor3 = core.eager.EagerTensor() + tensor3 = core.eager.Tensor() if core.is_compiled_with_cuda(): tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0)) @@ -683,7 +682,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): tensor2 = None tensor = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, core.CPUPlace()) - tensor3 = core.eager.EagerTensor() + tensor3 = core.eager.Tensor() if core.is_compiled_with_cuda(): tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0)) @@ -748,7 +747,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): with _test_eager_guard(): arr = np.random.rand(4, 16, 16, 32).astype('float64') - egr_tensor0 = core.eager.EagerTensor(value=arr) + egr_tensor0 = core.eager.Tensor(value=arr) self.assertEqual(egr_tensor0.persistable, False) self.assertTrue("generated" in egr_tensor0.name) self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) @@ -766,7 +765,7 @@ class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): def test_set_value(self): with _test_eager_guard(): ori_arr = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor = core.eager.EagerTensor(value=ori_arr) + egr_tensor = core.eager.Tensor(value=ori_arr) self.assertEqual(egr_tensor.stop_gradient, True) 
self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr)) @@ -859,7 +858,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase): def test_backward_with_single_tensor(self): with _test_eager_guard(): arr4 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor12 = core.eager.EagerTensor(arr4, core.CPUPlace()) + egr_tensor12 = core.eager.Tensor(arr4, core.CPUPlace()) egr_tensor12.retain_grads() arr = np.ones([4, 16, 16, 32]).astype('float32') self.assertEqual(egr_tensor12.persistable, False) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 92d3dd7b6054b685cb5b560c20ebf2e249f640fe..a36b10f58ffaa503b6ccca580843f07b4bbfc2ac 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -203,7 +203,7 @@ class TestImperative(unittest.TestCase): with fluid.dygraph.guard(): if fluid.framework._in_eager_mode(): var_base = paddle.to_tensor(np.array([3, 4, 5])) - self.assertTrue(isinstance(var_base, core.eager.EagerTensor)) + self.assertTrue(isinstance(var_base, core.eager.Tensor)) else: var_base = paddle.to_tensor(np.array([3, 4, 5])) self.assertTrue(isinstance(var_base, core.VarBase)) @@ -221,13 +221,13 @@ class TestImperative(unittest.TestCase): t.set(x, fluid.CPUPlace()) if _in_eager_mode(): # TODO(jiabin): Support Kwargs and uncomment these tests - # egr_tmp = fluid.core.eager.EagerTensor(value=x, place=fluid.core.CPUPlace()) - egr_tmp2 = fluid.core.eager.EagerTensor(y, fluid.core.CPUPlace()) + # egr_tmp = fluid.core.eager.Tensor(value=x, place=fluid.core.CPUPlace()) + egr_tmp2 = fluid.core.eager.Tensor(y, fluid.core.CPUPlace()) egr_tmp3 = paddle.to_tensor(x) - egr_tmp4 = fluid.core.eager.EagerTensor(y) - # egr_tmp5 = fluid.core.eager.EagerTensor(value=x) + egr_tmp4 = fluid.core.eager.Tensor(y) + # egr_tmp5 = fluid.core.eager.Tensor(value=x) # TODO(jiabin): Support it when we merge LoDTensor with DenseTensor - egr_tmp6 = fluid.core.eager.EagerTensor(t) + egr_tmp6 = fluid.core.eager.Tensor(t) # self.assertTrue(np.array_equal(x, egr_tmp.numpy())) self.assertTrue(np.array_equal(y, egr_tmp2.numpy())) @@ -953,8 +953,7 @@ class TestMetaclass(unittest.TestCase): self.assertNotEqual(type(MyLayer).__name__, 'pybind11_type') if core._in_eager_mode(): self.assertEqual( - type(paddle.fluid.core.eager.EagerTensor).__name__, - 'pybind11_type') + type(paddle.fluid.core.eager.Tensor).__name__, 'pybind11_type') else: self.assertEqual( type(paddle.fluid.core.VarBase).__name__, 'pybind11_type') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py index 317353684317f6fa0e8cf37cda58f2041e70befd..4c457e9345c5d35aef1d221b1f744e4f93367eec 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -25,14 +25,15 @@ import paddle.fluid.core as core import paddle.fluid.dygraph.base as base from test_imperative_lod_tensor_to_selected_rows import SimpleNet +from paddle.fluid.framework import _test_eager_guard -call_forward_hook = False +call_forward_post_hook = False call_forward_pre_hook = False -def forward_hook(layer, input, output): - global call_forward_hook - call_forward_hook = True +def forward_post_hook(layer, input, output): + global call_forward_post_hook + call_forward_post_hook = True def forward_pre_hook(layer, input): @@ -40,7 +41,7 @@ def forward_pre_hook(layer, input): call_forward_pre_hook = True -def forward_hook1(layer, input, output): +def forward_post_hook1(layer, input, output): return output * 2 @@ -50,8 +51,8 @@ def forward_pre_hook1(layer, input): class Test_Forward_Hook(unittest.TestCase): - # test forward_pre_hook and forward_hook that have return value - def test_forward_hook_return_value(self): + # test forward_pre_hook and forward_post_hook that have return value + def func_forward_hook_return_value(self): seed = 90 places = [fluid.CPUPlace()] @@ -104,23 +105,23 @@ class Test_Forward_Hook(unittest.TestCase): self.assertTrue( np.array_equal(outs_pre_hook.numpy(), outs_origin.numpy())) - # register forward_hook - forward_hook_handle1 = simplenet.register_forward_post_hook( - forward_hook1) + # register forward_post_hook + forward_post_hook_handle1 = simplenet.register_forward_post_hook( + forward_post_hook1) outs_forward_hook = simplenet(input, y) self.assertTrue( np.array_equal(outs_forward_hook.numpy(), outs_origin.numpy() * 2)) - # remove forward_hook - forward_hook_handle1.remove() + # remove forward_post_hook + forward_post_hook_handle1.remove() outs_forward_hook = simplenet(input, y) self.assertTrue( np.array_equal(outs_forward_hook.numpy(), outs_origin.numpy())) - # test forward_pre_hook and forward_hook that don't have return value - def test_forward_hook(self): + # test forward_pre_hook and forward_post_hook that don't have return value + def func_forward_hook(self): seed = 90 places = [fluid.CPUPlace()] @@ -133,7 +134,7 @@ class Test_Forward_Hook(unittest.TestCase): fluid.default_main_program().random_seed = seed fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - global call_forward_hook + global call_forward_post_hook global call_forward_pre_hook input_word = np.array( @@ -158,38 +159,45 @@ class Test_Forward_Hook(unittest.TestCase): # origin, don't register any hook outs_origin = simplenet(input, y) - self.assertFalse(call_forward_hook) + self.assertFalse(call_forward_post_hook) self.assertFalse(call_forward_pre_hook) - # register forward_hook and forward_pre_hook - forward_hook_handle = simplenet.register_forward_post_hook( - forward_hook) + # register forward_post_hook and forward_pre_hook + forward_post_hook_handle = simplenet.register_forward_post_hook( + forward_post_hook) forward_pre_hook_handle = simplenet.register_forward_pre_hook( forward_pre_hook) outs_hook = simplenet(input, y) - self.assertTrue(call_forward_hook) + self.assertTrue(call_forward_post_hook) self.assertTrue(call_forward_pre_hook) outs_hook = simplenet(input, y) - self.assertTrue(call_forward_hook) + self.assertTrue(call_forward_post_hook) self.assertTrue(call_forward_pre_hook) - # remove forward_hook - forward_hook_handle.remove() - call_forward_hook = False + # remove forward_post_hook + forward_post_hook_handle.remove() + call_forward_post_hook = False call_forward_pre_hook = False outs_remove_forward_hook = simplenet(input, y) - self.assertFalse(call_forward_hook) 
+ self.assertFalse(call_forward_post_hook) self.assertTrue(call_forward_pre_hook) # remove forward_pre_hook forward_pre_hook_handle.remove() - call_forward_hook = False + call_forward_post_hook = False call_forward_pre_hook = False outs_remove_hook = simplenet(input, y) - self.assertFalse(call_forward_hook) + self.assertFalse(call_forward_post_hook) self.assertFalse(call_forward_pre_hook) + def test_forward_hook_return_value(self): + with _test_eager_guard(): + self.func_forward_hook() + self.func_forward_hook_return_value() + self.func_forward_hook() + self.func_forward_hook_return_value() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index 7b8d31ff030e503f872b9afd923ce4c6252a026a..1881f1bbbd4c330c522a6304ea3fe004fafbeb3b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -41,7 +41,7 @@ class TestImperativeNumpyBridge(unittest.TestCase): data_np[0][0] = -1 self.assertEqual(data_np[0][0], -1) if _in_eager_mode(): - # eager_mode, var2 is EagerTensor, is not subscriptable + # eager_mode, var2 is Tensor, is not subscriptable # TODO(wuweilong): to support slice in eager mode later self.assertNotEqual(var2.numpy()[0][0], -1) else: diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1bd5e08e28ef0f151b8c78b2537a08c66dd20e22..36038d656b7736afc94da32c29c56ce61b338cb4 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -34,6 +34,7 @@ from test_imperative_base import new_program_scope from paddle.fluid.dygraph import nn from paddle.fluid.dygraph import base from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import _test_eager_guard class LayerTest(unittest.TestCase): @@ -98,6 +99,14 @@ class TestLayer(LayerTest): return ret with self.dynamic_graph(): + with _test_eager_guard(): + inp = np.ones([3, 3], dtype='float32') + x = base.to_variable(inp) + custom = CustomLayer(input_size=3, linear1_size=2) + ret = custom(x, do_linear2=False) + self.assertTrue(np.array_equal(ret.numpy().shape, [3, 2])) + ret = custom(x, do_linear2=True) + self.assertTrue(np.array_equal(ret.numpy().shape, [3, 1])) inp = np.ones([3, 3], dtype='float32') x = base.to_variable(inp) custom = CustomLayer(input_size=3, linear1_size=2) @@ -121,6 +130,15 @@ class TestLayer(LayerTest): static_ret, static_ret2 = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret, ret2]) with self.dynamic_graph(): + with _test_eager_guard(): + t = base.to_variable(inp) + dropout = nn.Dropout(p=0.35, seed=1, is_test=False) + dy_eager_ret = dropout(t) + dy_eager_ret2 = fluid.layers.dropout( + t, dropout_prob=0.35, seed=1, is_test=False) + dy_eager_ret_value = dy_eager_ret.numpy() + dy_eager_ret2_value = dy_eager_ret2.numpy() + t = base.to_variable(inp) dropout = nn.Dropout(p=0.35, seed=1, is_test=False) dy_ret = dropout(t) @@ -129,6 +147,9 @@ class TestLayer(LayerTest): dy_ret_value = dy_ret.numpy() dy_ret2_value = dy_ret2.numpy() + self.assertTrue(np.array_equal(dy_eager_ret_value, dy_eager_ret2_value)) + self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) + self.assertTrue(np.array_equal(static_ret, static_ret2)) self.assertTrue(np.array_equal(dy_ret_value, dy_ret2_value)) self.assertTrue(np.array_equal(static_ret, 
dy_ret_value)) @@ -147,12 +168,22 @@ class TestLayer(LayerTest): static_ret = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = base.to_variable(inp) + linear = nn.Linear( + 32, + 4, + bias_attr=fluid.initializer.ConstantInitializer(value=1)) + dy_eager_ret = linear(t) + dy_eager_ret_value = dy_eager_ret.numpy() + t = base.to_variable(inp) linear = nn.Linear( 32, 4, bias_attr=fluid.initializer.ConstantInitializer(value=1)) dy_ret = linear(t) dy_ret_value = dy_ret.numpy() + self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) self.assertTrue(np.array_equal(static_ret, dy_ret_value)) with self.static_graph(): @@ -193,11 +224,18 @@ class TestLayer(LayerTest): static_ret = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = base.to_variable(inp) + flatten = nn.Flatten() + dy_eager_ret = flatten(t) + dy_eager_ret_value = dy_eager_ret.numpy() + t = base.to_variable(inp) flatten = nn.Flatten() dy_ret = flatten(t) dy_ret_value = dy_ret.numpy() + self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) self.assertTrue(np.array_equal(static_ret, dy_ret_value)) with self.static_graph(): @@ -253,13 +291,35 @@ class TestLayer(LayerTest): static_ret2 = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + lm = nn.LayerNorm( + normalized_shape=[32, 32], + bias_attr=fluid.initializer.ConstantInitializer(value=1), + act='sigmoid') + dy_eager_ret = lm(base.to_variable(inp)) + dy_eager_ret_value = dy_eager_ret.numpy() + lm = nn.LayerNorm( normalized_shape=[32, 32], bias_attr=fluid.initializer.ConstantInitializer(value=1), act='sigmoid') dy_ret = lm(base.to_variable(inp)) dy_ret_value = dy_ret.numpy() + with self.dynamic_graph(): + with _test_eager_guard(): + lm = nn.LayerNorm( + normalized_shape=[32, 32], + shift=False, + scale=False, + param_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=fluid.initializer.ConstantInitializer(value=1), + act='sigmoid') + lm(base.to_variable(inp)) + + self.assertFalse(hasattr(lm, "_scale_w")) + self.assertFalse(hasattr(lm, "_bias_w")) + lm = nn.LayerNorm( normalized_shape=[32, 32], shift=False, @@ -273,9 +333,18 @@ class TestLayer(LayerTest): self.assertFalse(hasattr(lm, "_bias_w")) self.assertTrue(np.array_equal(static_ret, static_ret2)) + self.assertTrue(np.array_equal(dy_eager_ret_value, static_ret2)) self.assertTrue(np.array_equal(dy_ret_value, static_ret2)) with self.dynamic_graph(): + with _test_eager_guard(): + lm = nn.LayerNorm( + normalized_shape=[16, 32], + bias_attr=fluid.initializer.ConstantInitializer(value=1), + act='sigmoid') + with self.assertRaises(ValueError): + lm(base.to_variable(inp)) + lm = nn.LayerNorm( normalized_shape=[16, 32], bias_attr=fluid.initializer.ConstantInitializer(value=1), @@ -295,11 +364,18 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = np.ones([3, 3, 5, 5], dtype='float32') + my_syncbn = paddle.nn.SyncBatchNorm(3) + dy_eager_ret = my_syncbn(base.to_variable(t)) + dy_eager_ret_value = dy_eager_ret.numpy() + t = np.ones([3, 3, 5, 5], dtype='float32') my_syncbn = paddle.nn.SyncBatchNorm(3) dy_ret = my_syncbn(base.to_variable(t)) dy_ret_value = dy_ret.numpy() self.assertTrue(np.array_equal(static_ret, dy_ret_value)) + self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) def test_relu(self): with 
self.static_graph(): @@ -310,11 +386,17 @@ class TestLayer(LayerTest): [3, 3], dtype='float32')}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = np.ones([3, 3], dtype='float32') + dy_eager_ret = layers.relu(base.to_variable(t)) + dy_eager_ret_value = dy_eager_ret.numpy() + t = np.ones([3, 3], dtype='float32') dy_ret = layers.relu(base.to_variable(t)) dy_ret_value = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_ret_value)) def test_matmul(self): with self.static_graph(): @@ -331,12 +413,20 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + t = np.ones([3, 3], dtype='float32') + t2 = np.ones([3, 3], dtype='float32') + dy_eager_ret = layers.matmul( + base.to_variable(t), base.to_variable(t2)) + dy_eager_ret_value = dy_eager_ret.numpy() + t = np.ones([3, 3], dtype='float32') t2 = np.ones([3, 3], dtype='float32') dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2)) dy_ret_value = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_ret_value)) def test_conv2d(self): with self.static_graph(): @@ -358,6 +448,13 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 5, 5], dtype='float32') + conv2d = nn.Conv2D( + num_channels=3, num_filters=3, filter_size=[2, 2]) + dy_eager_ret = conv2d(base.to_variable(images)) + dy_eager_ret_value = dy_eager_ret.numpy() + images = np.ones([2, 3, 5, 5], dtype='float32') conv2d = nn.Conv2D( num_channels=3, num_filters=3, filter_size=[2, 2]) @@ -365,6 +462,16 @@ class TestLayer(LayerTest): dy_ret_value = dy_ret.numpy() with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 5, 5], dtype='float32') + conv2d = nn.Conv2D( + num_channels=3, + num_filters=3, + filter_size=[2, 2], + bias_attr=False) + dy_ret = conv2d(base.to_variable(images)) + self.assertTrue(conv2d.bias is None) + images = np.ones([2, 3, 5, 5], dtype='float32') conv2d = nn.Conv2D( num_channels=3, @@ -396,9 +503,49 @@ class TestLayer(LayerTest): self.assertRaises(TypeError, test_type) self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_ret_value)) self.assertTrue(np.allclose(static_ret, static_ret2)) with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 5, 5], dtype='float32') + custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + conv2d1 = nn.Conv2D( + num_channels=3, num_filters=3, filter_size=[2, 2]) + conv2d2 = nn.Conv2D( + num_channels=3, + num_filters=3, + filter_size=[2, 2], + param_attr=weight_attr) + dy_ret1 = conv2d1(base.to_variable(images)) + dy_ret2 = conv2d2(base.to_variable(images)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv2d1_weight_np = conv2d1.weight.numpy() + conv2d1_bias = conv2d1.bias + self.assertFalse( + np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + conv2d2.weight.set_value(conv2d1_weight_np) + self.assertTrue( + np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + conv2d2.bias.set_value(conv2d1_bias) + dy_ret1 = conv2d1(base.to_variable(images)) + dy_ret2 = conv2d2(base.to_variable(images)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv2d2.weight 
= conv2d1.weight + conv2d2.bias = conv2d1.bias + self.assertTrue( + np.array_equal(conv2d1.weight.numpy(), + conv2d2.weight.numpy())) + self.assertTrue( + np.array_equal(conv2d1.bias.numpy(), conv2d2.bias.numpy())) + images = np.ones([2, 3, 5, 5], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( @@ -467,6 +614,14 @@ class TestLayer(LayerTest): fetch_list=[updated_hidden, reset_hidden_pre, gate]) with self.dynamic_graph(): + with _test_eager_guard(): + gru = nn.GRUUnit(size=D * 3) + dy_eager_ret = gru( + base.to_variable(input), base.to_variable(hidden_input)) + dy_eager_ret_value = [] + for i in range(len(static_ret)): + dy_eager_ret_value.append(dy_eager_ret[i].numpy()) + gru = nn.GRUUnit(size=D * 3) dy_ret = gru( base.to_variable(input), base.to_variable(hidden_input)) @@ -477,8 +632,40 @@ class TestLayer(LayerTest): for i in range(len(static_ret)): self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) self.assertTrue(np.allclose(static_ret[i], dy_ret_value[i])) + self.assertTrue(np.allclose(static_ret[i], dy_eager_ret_value[i])) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(D, D * 3).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + gru1 = nn.GRUUnit(size=D * 3) + gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr) + dy_ret1 = gru1( + base.to_variable(input), base.to_variable(hidden_input)) + dy_ret2 = gru2( + base.to_variable(input), base.to_variable(hidden_input)) + self.assertFalse( + np.array_equal(gru1.weight.numpy(), gru2.weight.numpy())) + for o1, o2 in zip(dy_ret1, dy_ret2): + self.assertFalse(np.array_equal(o1.numpy(), o2.numpy())) + gru2.weight.set_value(gru1.weight.numpy()) + gru2.bias.set_value(gru1.bias) + dy_ret1 = gru1( + base.to_variable(input), base.to_variable(hidden_input)) + dy_ret2 = gru2( + base.to_variable(input), base.to_variable(hidden_input)) + for o1, o2 in zip(dy_ret1, dy_ret2): + self.assertTrue(np.array_equal(o1.numpy(), o2.numpy())) + + gru2.weight = gru1.weight + gru2.bias = gru1.bias + self.assertTrue( + np.array_equal(gru1.weight.numpy(), gru2.weight.numpy())) + self.assertTrue( + np.array_equal(gru1.bias.numpy(), gru2.bias.numpy())) + custom_weight = np.random.randn(D, D * 3).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -543,19 +730,37 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + ret = layers.elementwise_add(to_variable(n), to_variable(n2)) + ret = layers.elementwise_pow(ret, to_variable(n3)) + ret = layers.elementwise_div(ret, to_variable(n4)) + ret = layers.elementwise_sub(ret, to_variable(n5)) + dy_eager_ret = layers.elementwise_mul(ret, to_variable(n6)) + dy_eager_ret_value = dy_eager_ret.numpy() + ret = layers.elementwise_add(to_variable(n), to_variable(n2)) ret = layers.elementwise_pow(ret, to_variable(n3)) ret = layers.elementwise_div(ret, to_variable(n4)) ret = layers.elementwise_sub(ret, to_variable(n5)) dy_ret = layers.elementwise_mul(ret, to_variable(n6)) dy_ret_value = dy_ret.numpy() + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_ret_value)) def test_elementwise_minmax(self): n = np.ones([3, 3], dtype='float32') n2 = np.ones([3, 3], dtype='float32') * 2 with self.dynamic_graph(): + with _test_eager_guard(): + min_eager_ret = layers.elementwise_min( + to_variable(n), 
to_variable(n2)) + max_eager_ret = layers.elementwise_max( + to_variable(n), to_variable(n2)) + min_eager_ret_value = min_eager_ret.numpy() + max_eager_ret_value = max_eager_ret.numpy() + min_ret = layers.elementwise_min(to_variable(n), to_variable(n2)) max_ret = layers.elementwise_max(to_variable(n), to_variable(n2)) min_ret_value = min_ret.numpy() @@ -563,6 +768,8 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(n, min_ret_value)) self.assertTrue(np.allclose(n2, max_ret_value)) + self.assertTrue(np.allclose(n, min_eager_ret_value)) + self.assertTrue(np.allclose(n2, max_eager_ret_value)) def test_sequence_conv(self): inp_np = np.arange(12).reshape([3, 4]).astype('float32') @@ -633,6 +840,16 @@ class TestLayer(LayerTest): static_rlt2 = self.get_static_graph_result( feed={'pixel': inp_np}, fetch_list=[out])[0] with self.dynamic_graph(): + with _test_eager_guard(): + conv2d_transpose = nn.Conv2DTranspose( + num_channels=3, + num_filters=10, + filter_size=27, + act='sigmoid', + bias_attr=fluid.initializer.ConstantInitializer(value=1)) + dy_eager_rlt = conv2d_transpose(base.to_variable(inp_np)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + conv2d_transpose = nn.Conv2DTranspose( num_channels=3, num_filters=10, @@ -643,8 +860,48 @@ class TestLayer(LayerTest): dy_rlt_value = dy_rlt.numpy() self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt2)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt2)) with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 5, 5], dtype='float32') + custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + conv2d1 = nn.Conv2DTranspose( + num_channels=3, num_filters=3, filter_size=[2, 2]) + conv2d2 = nn.Conv2DTranspose( + num_channels=3, + num_filters=3, + filter_size=[2, 2], + param_attr=weight_attr) + dy_ret1 = conv2d1(base.to_variable(images)) + dy_ret2 = conv2d2(base.to_variable(images)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv2d1_weight_np = conv2d1.weight.numpy() + conv2d1_bias = conv2d1.bias + self.assertFalse( + np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + conv2d2.weight.set_value(conv2d1_weight_np) + self.assertTrue( + np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + conv2d2.bias.set_value(conv2d1_bias) + dy_ret1 = conv2d1(base.to_variable(images)) + dy_ret2 = conv2d2(base.to_variable(images)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv2d2.weight = conv2d1.weight + conv2d2.bias = conv2d1.bias + self.assertTrue( + np.array_equal(conv2d1.weight.numpy(), + conv2d2.weight.numpy())) + self.assertTrue( + np.array_equal(conv2d1.bias.numpy(), conv2d2.bias.numpy())) + images = np.ones([2, 3, 5, 5], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( @@ -750,6 +1007,17 @@ class TestLayer(LayerTest): feed={'x': inp_np_x, 'y': inp_np_y}, fetch_list=[out])[0] with self.dynamic_graph(): + with _test_eager_guard(): + btp = nn.BilinearTensorProduct( + 3, + 3, + 6, + bias_attr=fluid.initializer.ConstantInitializer(value=1), + act='sigmoid') + dy_eager_rlt = btp( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + btp = nn.BilinearTensorProduct( 3, 3, @@ -758,11 +1026,19 @@ class TestLayer(LayerTest): act='sigmoid') dy_rlt = btp(base.to_variable(inp_np_x), 
base.to_variable(inp_np_y)) dy_rlt_value = dy_rlt.numpy() + with self.dynamic_graph(): + with _test_eager_guard(): + btp2 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') + dy_eager_rlt2 = btp2( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_eager_rlt2_value = dy_eager_rlt2.numpy() + btp2 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') dy_rlt2 = btp2( base.to_variable(inp_np_x), base.to_variable(inp_np_y)) dy_rlt2_value = dy_rlt2.numpy() + with self.static_graph(): data_x2 = layers.data( name='x', @@ -782,10 +1058,42 @@ class TestLayer(LayerTest): 'y': inp_np_y}, fetch_list=[out2])[0] self.assertTrue(np.array_equal(dy_rlt2_value, static_rlt3)) + self.assertTrue(np.array_equal(dy_eager_rlt2_value, static_rlt3)) self.assertTrue(np.array_equal(static_rlt2, static_rlt)) self.assertTrue(np.array_equal(dy_rlt_value, static_rlt)) + self.assertTrue(np.array_equal(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(6, 3, 3).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + btp1 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') + btp2 = nn.BilinearTensorProduct( + 3, 3, 6, act='sigmoid', param_attr=weight_attr) + dy_rlt1 = btp1( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_rlt2 = btp2( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + self.assertFalse( + np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + btp2.weight.set_value(btp1.weight.numpy()) + btp2.bias.set_value(btp1.bias) + dy_rlt1 = btp1( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_rlt2 = btp2( + base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + self.assertTrue( + np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + + btp2.weight = btp1.weight + btp2.bias = btp1.bias + self.assertTrue( + np.array_equal(btp1.weight.numpy(), btp2.weight.numpy())) + self.assertTrue( + np.array_equal(btp1.bias.numpy(), btp2.bias.numpy())) + custom_weight = np.random.randn(6, 3, 3).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -842,6 +1150,15 @@ class TestLayer(LayerTest): feed={"input": inp_np}, fetch_list=[out])[0] with self.dynamic_graph(): + with _test_eager_guard(): + prelu = nn.PRelu( + mode=mode, + channel=inp_np.shape[1], + input_shape=inp_np.shape, + param_attr=ParamAttr(initializer=Constant(1.0))) + dy_eager_rlt = prelu(base.to_variable(inp_np)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + prelu = nn.PRelu( mode=mode, channel=inp_np.shape[1], @@ -852,8 +1169,40 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + inp_np = np.random.randn(5, 200, 100, 100).astype("float32") + inp = base.to_variable(inp_np) + prelu1 = nn.PRelu( + mode=mode, + channel=inp_np.shape[1], + input_shape=inp_np.shape, + param_attr=ParamAttr(initializer=Constant(2.0))) + prelu2 = nn.PRelu( + mode=mode, + channel=inp_np.shape[1], + input_shape=inp_np.shape, + param_attr=ParamAttr(initializer=Constant(1.0))) + dy_rlt1 = prelu1(inp) + dy_rlt2 = prelu2(inp) + self.assertFalse( + np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy( + ))) + self.assertFalse( + np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + prelu2.weight.set_value(prelu1.weight.numpy()) + dy_rlt1 = prelu1(inp) + dy_rlt2 = 
prelu2(inp) + self.assertTrue( + np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + + prelu2.weight = prelu1.weight + self.assertTrue( + np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy( + ))) + inp_np = np.random.randn(5, 200, 100, 100).astype("float32") inp = base.to_variable(inp_np) prelu1 = nn.PRelu( @@ -905,6 +1254,14 @@ class TestLayer(LayerTest): static_rlt2 = self.get_static_graph_result( feed={'word': inp_word}, fetch_list=[emb_rlt])[0] with self.dynamic_graph(): + with _test_eager_guard(): + emb2 = nn.Embedding( + size=[dict_size, 32], + param_attr='eager_emb.w', + is_sparse=False) + dy_eager_rlt = emb2(base.to_variable(inp_word)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + emb2 = nn.Embedding( size=[dict_size, 32], param_attr='emb.w', is_sparse=False) dy_rlt = emb2(base.to_variable(inp_word)) @@ -912,8 +1269,34 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(dict_size, 32).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + emb1 = nn.Embedding(size=[dict_size, 32], is_sparse=False) + emb2 = nn.Embedding( + size=[dict_size, 32], + param_attr=weight_attr, + is_sparse=False) + rep1 = emb1(base.to_variable(inp_word)) + rep2 = emb2(base.to_variable(inp_word)) + self.assertFalse( + np.array_equal(emb1.weight.numpy(), custom_weight)) + self.assertTrue( + np.array_equal(emb2.weight.numpy(), custom_weight)) + self.assertFalse(np.array_equal(rep1.numpy(), rep2.numpy())) + emb2.weight.set_value(emb1.weight.numpy()) + rep2 = emb2(base.to_variable(inp_word)) + self.assertTrue(np.array_equal(rep1.numpy(), rep2.numpy())) + + emb2.weight = emb1.weight + self.assertTrue( + np.array_equal(emb1.weight.numpy(), emb2.weight.numpy())) + custom_weight = np.random.randn(dict_size, 32).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -978,6 +1361,7 @@ class TestLayer(LayerTest): feed_dict['word_{0}'.format(i)] = inp_word[i] static_rlt = self.get_static_graph_result( feed=feed_dict, fetch_list=[nce_loss])[0] + with self.static_graph(): words = [] for i in range(window_size): @@ -1018,6 +1402,41 @@ class TestLayer(LayerTest): feed=feed_dict, fetch_list=[nce_loss2])[0] with self.dynamic_graph(): + with _test_eager_guard(): + words = [] + for i in range(window_size): + words.append(base.to_variable(inp_word[i])) + sample_weights = layers.fill_constant( + shape=[5, 1], dtype='float32', value=1) + emb = nn.Embedding( + size=[dict_size, 32], + param_attr='eager_emb.w', + is_sparse=False) + + embs3 = [] + for i in range(window_size): + if i == label_word: + continue + + emb_rlt = emb(words[i]) + embs3.append(emb_rlt) + + embs3 = layers.concat( + input=embs3, axis=fluid.dygraph.to_variable(np.array([1]))) + nce = nn.NCE(num_total_classes=dict_size, + dim=embs3.shape[1], + num_neg_samples=2, + sampler="custom_dist", + custom_dist=nid_freq_arr.tolist(), + seed=seed, + param_attr='eager_nce.w', + bias_attr='eager_nce.b', + sample_weight=sample_weights) + + wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) + dy_eager_rlt = nce(embs3, wl) + dy_eager_rlt_value = dy_eager_rlt.numpy() + words = [] for i in range(window_size): words.append(base.to_variable(inp_word[i])) @@ -1052,8 +1471,75 @@ class TestLayer(LayerTest): 
self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(dict_size, + 128).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + words = [] + for i in range(window_size): + words.append(base.to_variable(inp_word[i])) + sample_weights = layers.fill_constant( + shape=fluid.dygraph.to_variable(np.array([5, 1])), + dtype='float32', + value=1) + emb = nn.Embedding( + size=[dict_size, 32], + param_attr='eager_emb.w', + is_sparse=False) + + embs3 = [] + for i in range(window_size): + if i == label_word: + continue + + emb_rlt = emb(words[i]) + embs3.append(emb_rlt) + + embs3 = layers.concat(input=embs3, axis=1) + nce1 = nn.NCE(num_total_classes=dict_size, + dim=embs3.shape[1], + num_neg_samples=2, + sampler="custom_dist", + custom_dist=nid_freq_arr.tolist(), + seed=seed, + param_attr='eager_nce1.w', + bias_attr='eager_nce1.b', + sample_weight=sample_weights) + + nce2 = nn.NCE(num_total_classes=dict_size, + dim=embs3.shape[1], + num_neg_samples=2, + sampler="custom_dist", + custom_dist=nid_freq_arr.tolist(), + seed=seed, + param_attr=weight_attr, + bias_attr='eager_nce2.b', + sample_weight=sample_weights) + + wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) + nce1_loss = nce1(embs3, wl) + nce2_loss = nce2(embs3, wl) + self.assertFalse( + np.array_equal(nce1_loss.numpy(), nce2_loss.numpy())) + nce2.weight.set_value(nce1.weight.numpy()) + nce2.bias.set_value(nce1.bias) + nce1_loss = nce1(embs3, wl) + nce2_loss = nce2(embs3, wl) + self.assertTrue( + np.array_equal(nce1_loss.numpy(), nce2_loss.numpy())) + + nce2.weight = nce1.weight + nce2.bias = nce1.bias + self.assertTrue( + np.array_equal(nce1.weight.numpy(), nce2.weight.numpy())) + self.assertTrue( + np.array_equal(nce1.bias.numpy(), nce2.bias.numpy())) + custom_weight = np.random.randn(dict_size, 128).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -1118,6 +1604,17 @@ class TestLayer(LayerTest): def test_one_hot(self): with self.dynamic_graph(): + with _test_eager_guard(): + label = fluid.dygraph.to_variable( + np.array([[1], [1], [3], [0]])) + one_hot_label1 = fluid.layers.one_hot(input=label, depth=4) + one_hot_label2 = fluid.layers.one_hot( + input=label, + depth=fluid.dygraph.to_variable(np.array([4]))) + self.assertTrue( + np.array_equal(one_hot_label1.numpy(), + one_hot_label2.numpy())) + label = fluid.dygraph.to_variable(np.array([[1], [1], [3], [0]])) one_hot_label1 = fluid.layers.one_hot(input=label, depth=4) one_hot_label2 = fluid.layers.one_hot( @@ -1127,6 +1624,16 @@ class TestLayer(LayerTest): def test_split(self): with self.dynamic_graph(): + with _test_eager_guard(): + input = fluid.dygraph.to_variable(np.random.random((3, 8, 5))) + x0, x1 = fluid.layers.split(input, num_or_sections=2, dim=1) + x00, x11 = fluid.layers.split( + input, + num_or_sections=2, + dim=fluid.dygraph.to_variable(np.array([1]))) + self.assertTrue(np.array_equal(x0.numpy(), x00.numpy())) + self.assertTrue(np.array_equal(x1.numpy(), x11.numpy())) + input = fluid.dygraph.to_variable(np.random.random((3, 8, 5))) x0, x1 = fluid.layers.split(input, num_or_sections=2, dim=1) x00, x11 = fluid.layers.split( @@ -1138,6 +1645,17 @@ class TestLayer(LayerTest): def test_topk(self): with self.dynamic_graph(): + with _test_eager_guard(): + 
input = fluid.dygraph.to_variable(np.random.random((13, 11))) + top5_values1, top5_indices1 = layers.topk(input, k=5) + top5_values2, top5_indices2 = layers.topk( + input, k=fluid.dygraph.to_variable(np.array([5]))) + self.assertTrue( + np.array_equal(top5_values1.numpy(), top5_values2.numpy())) + self.assertTrue( + np.array_equal(top5_indices1.numpy(), top5_indices2.numpy( + ))) + input = fluid.dygraph.to_variable(np.random.random((13, 11))) top5_values1, top5_indices1 = layers.topk(input, k=5) top5_values2, top5_indices2 = layers.topk( @@ -1168,15 +1686,61 @@ class TestLayer(LayerTest): fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D(num_channels=3, num_filters=3, filter_size=2) + dy_eager_ret = conv3d(base.to_variable(images)) + dy_eager_rlt_value = dy_eager_ret.numpy() + images = np.ones([2, 3, 6, 6, 6], dtype='float32') conv3d = nn.Conv3D(num_channels=3, num_filters=3, filter_size=2) dy_ret = conv3d(base.to_variable(images)) dy_rlt_value = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_rlt_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value)) self.assertTrue(np.allclose(static_ret, static_ret2)) with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + conv3d1 = nn.Conv3D( + num_channels=3, num_filters=3, filter_size=2) + conv3d2 = nn.Conv3D( + num_channels=3, + num_filters=3, + filter_size=2, + param_attr=weight_attr) + dy_ret1 = conv3d1(base.to_variable(images)) + dy_ret2 = conv3d2(base.to_variable(images)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv3d1_weight_np = conv3d1.weight.numpy() + conv3d1_bias = conv3d1.bias + self.assertFalse( + np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + conv3d2.weight.set_value(conv3d1_weight_np) + self.assertTrue( + np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + conv3d1.bias.set_value(conv3d1_bias) + dy_ret1 = conv3d1(base.to_variable(images)) + dy_ret2 = conv3d2(base.to_variable(images)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv3d2.weight = conv3d1.weight + conv3d2.bias = conv3d1.bias + self.assertTrue( + np.array_equal(conv3d1.weight.numpy(), + conv3d2.weight.numpy())) + self.assertTrue( + np.array_equal(conv3d1.bias.numpy(), conv3d2.bias.numpy())) + images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( @@ -1309,6 +1873,7 @@ class TestLayer(LayerTest): with_lod=True)[0] with self.dynamic_graph(): + # TODO(wuweilong): Add with _test_eager_guard(): groupNorm = nn.GroupNorm( channels=shape[1], groups=2, @@ -1347,17 +1912,29 @@ class TestLayer(LayerTest): feed={'X': input}, fetch_list=[ret])[0] with self.dynamic_graph(): + with _test_eager_guard(): + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) + dy_eager_ret = instanceNorm(base.to_variable(input)) + dy_eager_rlt_value = dy_eager_ret.numpy() + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) dy_ret = instanceNorm(base.to_variable(input)) dy_rlt_value = dy_ret.numpy() with self.dynamic_graph(): + with _test_eager_guard(): + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) + dy_eager_ret = instanceNorm(base.to_variable(input)) + 
dy_eager_rlt_value2 = dy_eager_ret.numpy() + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) dy_ret = instanceNorm(base.to_variable(input)) dy_rlt_value2 = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_rlt_value)) self.assertTrue(np.allclose(static_ret, dy_rlt_value2)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value2)) self.assertTrue(np.allclose(static_ret, static_ret2)) with self.static_graph(): @@ -1420,11 +1997,17 @@ class TestLayer(LayerTest): with_lod=True)[0] with self.dynamic_graph(): + with _test_eager_guard(): + spectralNorm = nn.SpectralNorm(shape, dim=1, power_iters=2) + dy_eager_ret = spectralNorm(base.to_variable(input)) + dy_eager_rlt_value = dy_eager_ret.numpy() + spectralNorm = nn.SpectralNorm(shape, dim=1, power_iters=2) dy_ret = spectralNorm(base.to_variable(input)) dy_rlt_value = dy_ret.numpy() self.assertTrue(np.allclose(static_ret, dy_rlt_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value)) self.assertTrue(np.allclose(static_ret, static_ret2)) def test_tree_conv(self): @@ -1492,6 +2075,13 @@ class TestLayer(LayerTest): with_lod=False)[0] with self.dynamic_graph(): + with _test_eager_guard(): + treeConv = nn.TreeConv( + feature_size=5, output_size=6, num_filters=1, max_depth=2) + dy_eager_ret = treeConv( + base.to_variable(vectors), base.to_variable(adj)) + dy_eager_rlt_value = dy_eager_ret.numpy() + treeConv = nn.TreeConv( feature_size=5, output_size=6, num_filters=1, max_depth=2) dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj)) @@ -1499,8 +2089,51 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret, static_ret2)) self.assertTrue(np.allclose(static_ret, dy_rlt_value)) + self.assertTrue(np.allclose(static_ret, dy_eager_rlt_value)) with self.dynamic_graph(): + with _test_eager_guard(): + custom_weight = np.random.randn(5, 3, 6, 1).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + treeConv1 = nn.TreeConv( + feature_size=5, + output_size=6, + num_filters=1, + max_depth=2, + bias_attr='eager_tc1_b') + treeConv2 = nn.TreeConv( + feature_size=5, + output_size=6, + num_filters=1, + max_depth=2, + param_attr=weight_attr, + bias_attr='eager_tc2_b') + dy_ret1 = treeConv1( + base.to_variable(vectors), base.to_variable(adj)) + dy_ret2 = treeConv2( + base.to_variable(vectors), base.to_variable(adj)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + treeConv2.weight.set_value(treeConv1.weight.numpy()) + treeConv2.bias.set_value(treeConv1.bias) + dy_ret1 = treeConv1( + base.to_variable(vectors), base.to_variable(adj)) + dy_ret2 = treeConv2( + base.to_variable(vectors), base.to_variable(adj)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + treeConv2.weight = treeConv1.weight + treeConv2.bias = treeConv1.bias + self.assertTrue( + np.array_equal(treeConv1.weight.numpy(), + treeConv2.weight.numpy())) + self.assertTrue( + np.array_equal(treeConv1.bias.numpy(), + treeConv2.bias.numpy())) + custom_weight = np.random.randn(5, 3, 6, 1).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -1557,14 +2190,69 @@ class TestLayer(LayerTest): static_rlt2 = self.get_static_graph_result( feed={'pixel': input_array}, fetch_list=[out])[0] with self.dynamic_graph(): + with _test_eager_guard(): + conv3d_transpose = nn.Conv3DTranspose( + num_channels=3, + num_filters=12, + 
filter_size=12, + use_cudnn=False) + dy_eager_rlt = conv3d_transpose(base.to_variable(input_array)) + dy_eager_rlt_value = dy_eager_rlt.numpy() + conv3d_transpose = nn.Conv3DTranspose( num_channels=3, num_filters=12, filter_size=12, use_cudnn=False) dy_rlt = conv3d_transpose(base.to_variable(input_array)) dy_rlt_value = dy_rlt.numpy() self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) + self.assertTrue(np.allclose(dy_eager_rlt_value, static_rlt)) with self.dynamic_graph(): + with _test_eager_guard(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") + weight_attr = fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + custom_weight)) + conv3d1 = nn.Conv3DTranspose( + num_channels=3, + num_filters=3, + filter_size=2, + bias_attr='eager_conv3d1_b', + use_cudnn=False) + conv3d2 = nn.Conv3DTranspose( + num_channels=3, + num_filters=3, + filter_size=2, + param_attr=weight_attr, + bias_attr='eager_conv3d2_b', + use_cudnn=False) + dy_ret1 = conv3d1(base.to_variable(images)) + dy_ret2 = conv3d2(base.to_variable(images)) + self.assertFalse( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv3d1_weight_np = conv3d1.weight.numpy() + conv3d1_bias = conv3d1.bias + self.assertFalse( + np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + conv3d2.weight.set_value(conv3d1_weight_np) + self.assertTrue( + np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + conv3d1.bias.set_value(conv3d1_bias) + dy_ret1 = conv3d1(base.to_variable(images)) + dy_ret2 = conv3d2(base.to_variable(images)) + self.assertTrue( + np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + + conv3d2.weight = conv3d1.weight + conv3d2.bias = conv3d1.bias + self.assertTrue( + np.array_equal(conv3d1.weight.numpy(), + conv3d2.weight.numpy())) + self.assertTrue( + np.array_equal(conv3d1.bias.numpy(), conv3d2.bias.numpy())) + images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( @@ -1614,6 +2302,20 @@ class TestLayer(LayerTest): stack_rlt2 = np.stack(array_rlt2, axis=0) with self.dynamic_graph(): + with _test_eager_guard(): + eager_eye_tensor = layers.eye(num_rows=3, num_columns=2) + eager_eye_tensor_rlt1 = layers.eye(num_rows=3, + num_columns=2, + batch_shape=[3]) + eager_eye_tensor_rlt2 = layers.eye(num_rows=3, + num_columns=2, + batch_shape=[4, 3]) + eager_diag_tensor = layers.eye(20) + eager_eye_tensor_value = eager_eye_tensor.numpy() + eager_eye_tensor_rlt1_value = eager_eye_tensor_rlt1.numpy() + eager_eye_tensor_rlt2_value = eager_eye_tensor_rlt2.numpy() + eager_diag_tensor_value = eager_diag_tensor.numpy() + eye_tensor = layers.eye(num_rows=3, num_columns=2) eye_tensor_rlt1 = layers.eye(num_rows=3, num_columns=2, @@ -1626,6 +2328,12 @@ class TestLayer(LayerTest): eye_tensor_rlt1_value = eye_tensor_rlt1.numpy() eye_tensor_rlt2_value = eye_tensor_rlt2.numpy() diag_tensor_value = diag_tensor.numpy() + + self.assertTrue(np.allclose(eager_eye_tensor_value, np_eye)) + self.assertTrue(np.allclose(eager_eye_tensor_rlt1_value, stack_rlt1)) + self.assertTrue(np.allclose(eager_eye_tensor_rlt2_value, stack_rlt2)) + self.assertTrue(np.allclose(eager_diag_tensor_value, np.eye(20))) + self.assertTrue(np.allclose(eye_tensor_value, np_eye)) self.assertTrue(np.allclose(eye_tensor_rlt1_value, stack_rlt1)) self.assertTrue(np.allclose(eye_tensor_rlt2_value, stack_rlt2)) @@ -1655,6 +2363,7 @@ class 
TestLayer(LayerTest): static_ret = self.get_static_graph_result(feed={}, fetch_list=out) with self.dynamic_graph(): + # TODO(wuweilong): Add with _test_eager_guard(): i = layers.fill_constant(shape=[1], dtype='int64', value=0) ten = layers.fill_constant(shape=[1], dtype='int64', value=10) @@ -1687,6 +2396,14 @@ class TestLayer(LayerTest): feed={"a": value_a, "b": value_b}, fetch_list=[cond])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da = base.to_variable(value_a) + db = base.to_variable(value_b) + dcond = layers.less_than(x=da, y=db) + + for i in range(len(static_ret)): + self.assertTrue(dcond.numpy()[i] == static_ret[i]) + da = base.to_variable(value_a) db = base.to_variable(value_b) dcond = layers.less_than(x=da, y=db) @@ -1703,6 +2420,14 @@ class TestLayer(LayerTest): feed={"a1": value_a, "b1": value_b}, fetch_list=[cond1])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da1 = base.to_variable(value_a) + db1 = base.to_variable(value_b) + dcond1 = layers.less_equal(x=da1, y=db1) + + for i in range(len(static_ret1)): + self.assertTrue(dcond1.numpy()[i] == static_ret1[i]) + da1 = base.to_variable(value_a) db1 = base.to_variable(value_b) dcond1 = layers.less_equal(x=da1, y=db1) @@ -1719,6 +2444,14 @@ class TestLayer(LayerTest): feed={"a2": value_a, "b2": value_b}, fetch_list=[cond2])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da2 = base.to_variable(value_a) + db2 = base.to_variable(value_b) + dcond2 = layers.greater_than(x=da2, y=db2) + + for i in range(len(static_ret2)): + self.assertTrue(dcond2.numpy()[i] == static_ret2[i]) + da2 = base.to_variable(value_a) db2 = base.to_variable(value_b) dcond2 = layers.greater_than(x=da2, y=db2) @@ -1735,6 +2468,14 @@ class TestLayer(LayerTest): feed={"a3": value_a, "b3": value_b}, fetch_list=[cond3])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da3 = base.to_variable(value_a) + db3 = base.to_variable(value_b) + dcond3 = layers.greater_equal(x=da3, y=db3) + + for i in range(len(static_ret3)): + self.assertTrue(dcond3.numpy()[i] == static_ret3[i]) + da3 = base.to_variable(value_a) db3 = base.to_variable(value_b) dcond3 = layers.greater_equal(x=da3, y=db3) @@ -1751,6 +2492,14 @@ class TestLayer(LayerTest): feed={"a4": value_a, "b4": value_b}, fetch_list=[cond4])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da4 = base.to_variable(value_a) + db4 = base.to_variable(value_b) + dcond4 = layers.equal(x=da4, y=db4) + + for i in range(len(static_ret4)): + self.assertTrue(dcond4.numpy()[i] == static_ret4[i]) + da4 = base.to_variable(value_a) db4 = base.to_variable(value_b) dcond4 = layers.equal(x=da4, y=db4) @@ -1767,6 +2516,14 @@ class TestLayer(LayerTest): feed={"a5": value_a, "b5": value_b}, fetch_list=[cond5])[0] with self.dynamic_graph(): + with _test_eager_guard(): + da5 = base.to_variable(value_a) + db5 = base.to_variable(value_b) + dcond5 = layers.equal(x=da5, y=db5) + + for i in range(len(static_ret5)): + self.assertTrue(dcond5.numpy()[i] == static_ret5[i]) + da5 = base.to_variable(value_a) db5 = base.to_variable(value_b) dcond5 = layers.equal(x=da5, y=db5) @@ -1795,6 +2552,23 @@ class TestLayer(LayerTest): static_res = ret[0] with self.dynamic_graph(): + with _test_eager_guard(): + a = fluid.dygraph.to_variable(np.array([0.1]).astype('float32')) + b = fluid.dygraph.to_variable( + np.array([0.23]).astype('float32')) + out = layers.cond(a < b, lambda: less_than_branch(a, b), + lambda: greater_equal_branch(a, b)) + out2 = layers.cond(a >= b, lambda: greater_equal_branch(a, b), + 
lambda: less_than_branch(a, b)) + eager_dynamic_res = out.numpy() + eager_dynamic_res2 = out2.numpy() + self.assertTrue( + np.array_equal(eager_dynamic_res, eager_dynamic_res2)) + with self.assertRaises(TypeError): + layers.cond(a < b, 'str', 'str') + with self.assertRaises(TypeError): + layers.cond(a >= b, 'str', 'str') + a = fluid.dygraph.to_variable(np.array([0.1]).astype('float32')) b = fluid.dygraph.to_variable(np.array([0.23]).astype('float32')) out = layers.cond(a < b, lambda: less_than_branch(a, b), @@ -1810,6 +2584,7 @@ class TestLayer(LayerTest): layers.cond(a >= b, 'str', 'str') self.assertTrue(np.array_equal(static_res, dynamic_res)) + self.assertTrue(np.array_equal(static_res, eager_dynamic_res)) def test_case(self): def fn_1(): @@ -1840,6 +2615,23 @@ class TestLayer(LayerTest): static_res1, static_res2 = exe.run(fetch_list=[out_1, out_2]) with self.dynamic_graph(): + with _test_eager_guard(): + x = layers.fill_constant(shape=[1], dtype='float32', value=0.3) + y = layers.fill_constant(shape=[1], dtype='float32', value=0.1) + z = layers.fill_constant(shape=[1], dtype='float32', value=0.2) + + pred_1 = layers.less_than(z, x) # true: 0.2 < 0.3 + pred_2 = layers.less_than(x, y) # false: 0.3 < 0.1 + pred_3 = layers.equal(x, y) # false: 0.3 == 0.1 + + out_1 = layers.case( + pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], + default=fn_3) + out_2 = layers.case( + pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)]) + eager_dynamic_res1 = out_1.numpy() + eager_dynamic_res2 = out_2.numpy() + x = layers.fill_constant(shape=[1], dtype='float32', value=0.3) y = layers.fill_constant(shape=[1], dtype='float32', value=0.1) z = layers.fill_constant(shape=[1], dtype='float32', value=0.2) @@ -1856,6 +2648,8 @@ class TestLayer(LayerTest): self.assertTrue(np.array_equal(static_res1, dynamic_res1)) self.assertTrue(np.array_equal(static_res2, dynamic_res2)) + self.assertTrue(np.array_equal(static_res1, eager_dynamic_res1)) + self.assertTrue(np.array_equal(static_res2, eager_dynamic_res2)) def test_switch_case(self): def fn_1(): @@ -1891,6 +2685,29 @@ class TestLayer(LayerTest): fetch_list=[out_1, out_2, out_3]) with self.dynamic_graph(): + with _test_eager_guard(): + index_1 = layers.fill_constant( + shape=[1], dtype='int32', value=1) + index_2 = layers.fill_constant( + shape=[1], dtype='int32', value=2) + + out_1 = layers.switch_case( + branch_index=index_1, + branch_fns={1: fn_1, + 2: fn_2}, + default=fn_3) + out_2 = layers.switch_case( + branch_index=index_2, + branch_fns=[(1, fn_1), (2, fn_2)], + default=fn_3) + out_3 = layers.switch_case( + branch_index=index_2, + branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)]) + + eager_dynamic_res1 = out_1.numpy() + eager_dynamic_res2 = out_2.numpy() + eager_dynamic_res3 = out_3.numpy() + index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1) index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2) @@ -1914,6 +2731,9 @@ class TestLayer(LayerTest): self.assertTrue(np.array_equal(static_res1, dynamic_res1)) self.assertTrue(np.array_equal(static_res2, dynamic_res2)) self.assertTrue(np.array_equal(static_res3, dynamic_res3)) + self.assertTrue(np.array_equal(static_res1, eager_dynamic_res1)) + self.assertTrue(np.array_equal(static_res2, eager_dynamic_res2)) + self.assertTrue(np.array_equal(static_res3, eager_dynamic_res3)) def test_crop_tensor(self): with self.static_graph(): @@ -3281,6 +4101,14 @@ class TestBook(LayerTest): fetch_list=[output])[0] with self.dynamic_graph(): + with _test_eager_guard(): + x_dy = base.to_variable(x_np) + rois_dy = 
base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + dy_eager_res = layers.roi_pool( + x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy) + dy_eager_res_value = dy_eager_res[0].numpy() + x_dy = base.to_variable(x_np) rois_dy = base.to_variable(rois_np) rois_num_dy = base.to_variable(rois_num_np) @@ -3288,6 +4116,7 @@ class TestBook(LayerTest): x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy) dy_res_value = dy_res[0].numpy() self.assertTrue(np.array_equal(static_res, dy_res_value)) + self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) def test_sequence_enumerate(self): # TODO(minqiyang): dygraph do not support lod now @@ -3312,12 +4141,21 @@ class TestBook(LayerTest): fetch_list=[output])[0] with self.dynamic_graph(): + with _test_eager_guard(): + x_dy = base.to_variable(x_np) + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + dy_eager_res = layers.roi_align( + x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy) + dy_eager_res_value = dy_eager_res.numpy() + x_dy = base.to_variable(x_np) rois_dy = base.to_variable(rois_np) rois_num_dy = base.to_variable(rois_num_np) dy_res = layers.roi_align( x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy) dy_res_value = dy_res.numpy() + self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) self.assertTrue(np.array_equal(static_res, dy_res_value)) def test_dice_loss(self): @@ -3338,11 +4176,18 @@ class TestBook(LayerTest): fetch_list=[output])[0] with self.dynamic_graph(): + with _test_eager_guard(): + input_ = base.to_variable(input_np) + label_ = base.to_variable(label_np) + dy_eager_res = layers.dice_loss(input_, label_, eps) + dy_eager_res_value = dy_eager_res.numpy() + input_ = base.to_variable(input_np) label_ = base.to_variable(label_np) dy_res = layers.dice_loss(input_, label_, eps) dy_res_value = dy_res.numpy() self.assertTrue(np.array_equal(static_res, dy_res_value)) + self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) def test_roi_perspective_transform(self): # TODO(minqiyang): dygraph do not support lod now diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index d13bdd676b48e38844e78469a2c36156b272f5e4..9e3edd82681bca1c7f29046a7761543ca7550d50 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -47,6 +47,7 @@ def test_dygraph_assert_true(self, x_list, p_list): def gen_input(): + np.random.seed(2021) # generate square matrix or batches of square matrices input_1 = np.random.rand(5, 5).astype('float32') input_2 = np.random.rand(3, 6, 6).astype('float64') diff --git a/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py b/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py index 7ff6ebc0437b4c6b2e34492e91289bc11646a9ad..8d0a34009d6e589ec6cd14700faa869d63da31b2 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py @@ -36,6 +36,7 @@ class LinalgPinvTestCase(unittest.TestCase): def generate_input(self): self._input_shape = (5, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -102,6 +103,7 @@ class LinalgPinvTestCase(unittest.TestCase): class LinalgPinvTestCase1(LinalgPinvTestCase): def generate_input(self): self._input_shape = (4, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -109,6 +111,7 @@ class 
LinalgPinvTestCase1(LinalgPinvTestCase): class LinalgPinvTestCase2(LinalgPinvTestCase): def generate_input(self): self._input_shape = (5, 4) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -116,6 +119,7 @@ class LinalgPinvTestCase2(LinalgPinvTestCase): class LinalgPinvTestCaseBatch1(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -123,6 +127,7 @@ class LinalgPinvTestCaseBatch1(LinalgPinvTestCase): class LinalgPinvTestCaseBatch2(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 4, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -130,6 +135,7 @@ class LinalgPinvTestCaseBatch2(LinalgPinvTestCase): class LinalgPinvTestCaseBatch3(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 4) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -137,6 +143,7 @@ class LinalgPinvTestCaseBatch3(LinalgPinvTestCase): class LinalgPinvTestCaseBatch4(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 6, 5, 4) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -144,6 +151,7 @@ class LinalgPinvTestCaseBatch4(LinalgPinvTestCase): class LinalgPinvTestCaseBatchBig(LinalgPinvTestCase): def generate_input(self): self._input_shape = (2, 200, 300) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -151,6 +159,7 @@ class LinalgPinvTestCaseBatchBig(LinalgPinvTestCase): class LinalgPinvTestCaseFP32(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -163,6 +172,7 @@ class LinalgPinvTestCaseFP32(LinalgPinvTestCase): class LinalgPinvTestCaseRcond(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) self._input_data = np.random.random(self._input_shape).astype( self.dtype) @@ -175,6 +185,7 @@ class LinalgPinvTestCaseRcond(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian1(LinalgPinvTestCase): def generate_input(self): self._input_shape = (5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) + \ 1J * np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose().conj() @@ -188,6 +199,7 @@ class LinalgPinvTestCaseHermitian1(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian2(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) + \ 1J * np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose((0, 2, 1)).conj() @@ -201,6 +213,7 @@ class LinalgPinvTestCaseHermitian2(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian3(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) + \ 1J * np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose((0, 2, 1)).conj() @@ -214,6 +227,7 @@ class LinalgPinvTestCaseHermitian3(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian4(LinalgPinvTestCase): def generate_input(self): self._input_shape = (5, 5) + np.random.seed(123) x = 
np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose() @@ -226,6 +240,7 @@ class LinalgPinvTestCaseHermitian4(LinalgPinvTestCase): class LinalgPinvTestCaseHermitian5(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose((0, 2, 1)) @@ -238,6 +253,7 @@ class LinalgPinvTestCaseHermitian5(LinalgPinvTestCase): class LinalgPinvTestCaseHermitianFP32(LinalgPinvTestCase): def generate_input(self): self._input_shape = (3, 5, 5) + np.random.seed(123) x = np.random.random(self._input_shape).astype(self.dtype) self._input_data = x + x.transpose((0, 2, 1)) diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index efcc0e4cfe323294df88167a6100f019cef67005..ed1495c6352bb979058d1dca015171f013fd38d9 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -16,7 +16,8 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.testsuite import create_op import paddle.fluid.core as core import paddle @@ -73,17 +74,32 @@ class TestMatMulV2Op(OpTest): self.init_kernel_type() self.config() self.op_type = "matmul_v2" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y + if self.is_bfloat16_op(): + x = np.random.random(self.x_shape).astype(np.float32) + y = np.random.random(self.y_shape).astype(np.float32) + else: + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) - result = result.astype(self.dtype) - self.inputs = { - 'X': x, - 'Y': y, - } + if self.is_bfloat16_op(): + result = result.astype(np.float32) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.inputs_fp32 = { + 'X': x, + 'Y': y, + } + else: + result = result.astype(self.dtype) + self.inputs = { + 'X': x, + 'Y': y, + } self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} self.outputs = {'Out': result} @@ -97,7 +113,7 @@ class TestMatMulV2Op(OpTest): self.check_grad(['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -109,7 +125,7 @@ class TestMatMuklOp2(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -121,7 +137,7 @@ class TestMatMuklOp3(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -133,7 +149,7 @@ class TestMatMuklOp4(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -145,7 +161,7 @@ class TestMatMuklOp5(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -157,7 +173,7 @@ class TestMatMuklOp6(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -169,7 +185,7 @@ 
class TestMatMuklOp7(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -181,7 +197,7 @@ class TestMatMuklOp8(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -193,7 +209,7 @@ class TestMatMuklOp9(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -205,7 +221,7 @@ class TestMatMuklOp10(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -217,7 +233,7 @@ class TestMatMuklOp11(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -229,7 +245,7 @@ class TestMatMuklOp12(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -241,7 +257,7 @@ class TestMatMuklOp13(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -253,7 +269,7 @@ class TestMatMuklOp14(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -265,7 +281,7 @@ class TestMatMuklOp15(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the gradient for special case """ @@ -277,7 +293,7 @@ class TestMatMuklOp16(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -289,7 +305,7 @@ class TestMatMuklOp17(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOpBroadcast1(TestMatMulV2Op): +class TestMatMulOpBroadcast1(TestMatMulV2Op): """ case 14_3 """ @@ -301,7 +317,7 @@ class TestMatMuklOpBroadcast1(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOpBroadcast2(TestMatMulV2Op): +class TestMatMulOpBroadcast2(TestMatMulV2Op): """ case 14_4 """ @@ -343,22 +359,90 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): create_test_fp16_class(TestMatMulV2Op) -create_test_fp16_class(TestMatMuklOp2) -create_test_fp16_class(TestMatMuklOp3) -create_test_fp16_class(TestMatMuklOp4) -create_test_fp16_class(TestMatMuklOp5) -create_test_fp16_class(TestMatMuklOp6) -create_test_fp16_class(TestMatMuklOp7) -create_test_fp16_class(TestMatMuklOp8) -create_test_fp16_class(TestMatMuklOp9) -create_test_fp16_class(TestMatMuklOp10) -create_test_fp16_class(TestMatMuklOp11) -create_test_fp16_class(TestMatMuklOp12) -create_test_fp16_class(TestMatMuklOp13) -create_test_fp16_class(TestMatMuklOp14) -create_test_fp16_class(TestMatMuklOp15) -create_test_fp16_class(TestMatMuklOp16) -create_test_fp16_class(TestMatMuklOp17) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp7) +create_test_fp16_class(TestMatMulOp8) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) +create_test_fp16_class(TestMatMulOp14) +create_test_fp16_class(TestMatMulOp15) 
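# --- Illustrative aside, not part of the patch ---------------------------------
# The bf16 branch added to TestMatMulV2Op.setUp above feeds the operator uint16
# data produced by convert_float_to_uint16 while keeping the raw float32 arrays
# in self.inputs_fp32 for the numeric-gradient checks used by the bf16 cases
# registered below. A minimal, hypothetical sketch of how such a
# float32 -> bfloat16-bits conversion is commonly done (plain truncation of the
# IEEE float32 bit pattern; the real helper may round instead of truncate):
import numpy as np

def float32_to_bfloat16_bits(x):
    x = np.ascontiguousarray(x, dtype=np.float32)
    # bfloat16 keeps the sign, exponent and top 7 mantissa bits of float32,
    # i.e. the upper 16 bits of its 32-bit representation
    return (x.view(np.uint32) >> 16).astype(np.uint16)
# --------------------------------------------------------------------------------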
+create_test_fp16_class(TestMatMulOp16) +create_test_fp16_class(TestMatMulOp17) + +#--------------------test matmul bf16-------------------- + + +def create_test_bf16_class(parent, atol=0.01): + @unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0" + ) + class TestMatMulOpBf16Case(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op(scope, self.op_type, self.inputs, self.outputs, + self.attrs) + return get_numeric_gradient(place, scope, op, self.inputs_fp32, + check_name, ['Out']) + + def init_kernel_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def test_check_grad_x(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + self.check_grad_with_place( + place, ['X'], + 'Out', + no_grad_set=set(['Y']), + user_defined_grads=[numeric_grads]) + + def test_check_grad_y(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Y') + self.check_grad_with_place( + place, ['Y'], + 'Out', + no_grad_set=set(['X']), + user_defined_grads=[numeric_grads]) + + def test_check_grad(self): + pass + + cls_name = "{0}_{1}".format(parent.__name__, "Bf16") + TestMatMulOpBf16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpBf16Case + + +create_test_bf16_class(TestMatMulV2Op) +create_test_bf16_class(TestMatMulOp2) +create_test_bf16_class(TestMatMulOp3) +create_test_bf16_class(TestMatMulOp4) +create_test_bf16_class(TestMatMulOp5) +create_test_bf16_class(TestMatMulOp6) +create_test_bf16_class(TestMatMulOp7) +create_test_bf16_class(TestMatMulOp8) +create_test_bf16_class(TestMatMulOp9) +create_test_bf16_class(TestMatMulOp10) +create_test_bf16_class(TestMatMulOp11) +create_test_bf16_class(TestMatMulOp12) +create_test_bf16_class(TestMatMulOp13) +create_test_bf16_class(TestMatMulOp14) +create_test_bf16_class(TestMatMulOp15) +create_test_bf16_class(TestMatMulOp16) +create_test_bf16_class(TestMatMulOp17) class TestMatMulV2API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index 4eae44846efc701d90a5a4ad03c6e0e29dad77c7..1ffcb3442812dcf5a6d6357e1b87dfdfb6d6e839 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -83,8 +83,8 @@ type_dict_str_to_numpy = { } xpu_test_op_white_list = [] -xpu_test_type_white_list = [] -xpu_test_op_type_white_list = ['float64'] +xpu_test_type_white_list = ['float64'] +xpu_test_op_type_white_list = [] xpu_test_device_op_white_list = [] xpu_test_device_op_type_white_list = [] @@ -186,7 +186,7 @@ def get_xpu_op_support_types(op_name, dev_id=0): paddle.bfloat16]) else: support_type_str_list.append(type_dict_paddle_to_str[stype]) - type_white_list = get_op_type_white_list() + type_white_list = get_type_white_list() return [ stype for stype in support_type_str_list if stype not in type_white_list ] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py index 5c77d6304302c982b11a1710c000dc5570e33f23..4290c0abf122ada6c8611a7738cc8ff108506fa4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py +++ 
b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py @@ -18,220 +18,79 @@ import numpy as np import unittest import sys sys.path.append("..") + +import paddle from op_test import OpTest from op_test_xpu import XPUOpTest -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core - -from paddle.fluid import ParamAttr -from paddle.fluid.framework import Program, grad_var_name -from paddle.fluid.executor import Executor -from paddle.fluid.backward import append_backward +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestArgsortOp(XPUOpTest): - def setUp(self): - self.set_xpu() - self.op_type = "argsort" - self.place = paddle.XPUPlace(0) - self.init_dtype() - self.init_inputshape() - self.init_axis() - self.init_direction() - - self.x = np.random.random(self.input_shape).astype(self.dtype) - self.inputs = {"X": self.x} - self.attrs = {"axis": self.axis, "descending": self.descending} - self.get_output() - self.outputs = {"Out": self.sorted_x, "Indices": self.indices} - - def get_output(self): - if self.descending: - self.indices = np.flip( - np.argsort( - self.x, kind='heapsort', axis=self.axis), self.axis) - self.sorted_x = np.flip( - np.sort( - self.x, kind='heapsort', axis=self.axis), self.axis) - else: - self.indices = np.argsort(self.x, kind='heapsort', axis=self.axis) - self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) - - def set_xpu(self): - self.__class__.use_xpu = True - self.__class__.no_need_check_grad = True - - def init_inputshape(self): - self.input_shape = (2, 2, 2, 3, 3) - - def init_dtype(self): - self.dtype = 'float32' - - def init_axis(self): - self.axis = -1 - - def test_check_output(self): - self.check_output_with_place(self.place) - - def init_direction(self): - self.descending = False - - -class TestArgsortOpAxis0XPU(TestArgsortOp): - def init_axis(self): - self.axis = 0 - - -class TestArgsortOpAxis1XPU(TestArgsortOp): - def init_axis(self): - self.axis = 1 - - -class TestArgsortOpAxis2XPU(TestArgsortOp): - def init_axis(self): - self.axis = 2 - - -class TestArgsortOpAxisNeg1XPU(TestArgsortOp): - def init_axis(self): - self.axis = -1 - - -class TestArgsortOpAxisNeg2XPU(TestArgsortOp): - def init_axis(self): - self.axis = -2 - - -class TestArgsortOpDescendingAxisXPU(TestArgsortOp): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis0XPU(TestArgsortOpAxis0XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis1XPU(TestArgsortOpAxis1XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis2XPU(TestArgsortOpAxis2XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxisNeg1XPU(TestArgsortOpAxisNeg1XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxisNeg2XPU(TestArgsortOpAxisNeg2XPU): - def init_direction(self): - self.descending = True - - -class TestArgsortOpAxis0XPUINT64(TestArgsortOp): - def setUp(self): - self.set_xpu() - self.op_type = "argsort" - self.place = paddle.XPUPlace(0) - self.init_dtype() - self.init_inputshape() - self.init_axis() - self.init_direction() - - self.x = np.random.randint( - low=-1000, high=1000, size=self.input_shape).astype(self.dtype) - self.inputs = {"X": self.x} - self.attrs = {"axis": self.axis, "descending": self.descending} - self.get_output() - self.outputs = {"Out": self.sorted_x, "Indices": self.indices} - - 
def init_axis(self): - self.axis = 0 - - def init_dtype(self): - self.dtype = 'int64' - - -class TestArgsortOpAxis1XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_axis(self): - self.axis = 1 - - -class TestArgsortOpAxis2XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_axis(self): - self.axis = 2 - - -class TestArgsortOpAxisNeg1XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_axis(self): - self.axis = -1 - - -class TestArgsortOpAxisNeg2XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_axis(self): - self.axis = -2 - - -class TestArgsortOpDescendingAxisXPUINT64(TestArgsortOpAxis0XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis0XPUINT64(TestArgsortOpAxis0XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis1XPUINT64(TestArgsortOpAxis1XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxis2XPUINT64(TestArgsortOpAxis2XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxisNeg1XPUINT64(TestArgsortOpAxisNeg1XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpDescendingAxisNeg2XPUINT64(TestArgsortOpAxisNeg2XPUINT64): - def init_direction(self): - self.descending = True - - -class TestArgsortOpAxis0XPUINT(TestArgsortOp): - def setUp(self): - self.set_xpu() - self.op_type = "argsort" - self.place = paddle.XPUPlace(0) - self.init_dtype() - self.init_inputshape() - self.init_axis() - self.init_direction() - - self.x = np.random.randint( - low=-1000, high=1000, size=self.input_shape).astype(self.dtype) - self.inputs = {"X": self.x} - self.attrs = {"axis": self.axis, "descending": self.descending} - self.get_output() - self.outputs = {"Out": self.sorted_x, "Indices": self.indices} - - def init_axis(self): - self.axis = 0 - - def init_dtype(self): - self.dtype = 'int' - +class XPUTestArgsortOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'argsort' + self.use_dynamic_create_class = True + + def dynamic_create_class(self): + base_class = self.TestArgsortOp + classes = [] + for descending in [True, False]: + for axis in [0, 1, 2, -1, -2]: + class_name = 'XPUTestArgsortOp_axis_' + str(axis) + '_' + str( + descending) + attr_dict = {'init_axis': axis, 'init_descending': descending} + classes.append([class_name, attr_dict]) + return base_class, classes + + class TestArgsortOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "argsort" + self.place = paddle.XPUPlace(0) + self.dtype = self.in_type + self.input_shape = (2, 2, 2, 3, 3) + self.axis = -1 if not hasattr(self, 'init_axis') else self.init_axis + self.descending = False if not hasattr( + self, 'init_descending') else self.init_descending + + if self.dtype == np.float32: + self.x = np.random.random(self.input_shape).astype(self.dtype) + else: + self.x = np.random.randint( + low=-1000, high=1000, + size=self.input_shape).astype(self.dtype) + + self.inputs = {"X": self.x} + self.attrs = {"axis": self.axis, "descending": self.descending} + self.get_output() + self.outputs = {"Out": self.sorted_x, "Indices": self.indices} + + def get_output(self): + if self.descending: + self.indices = np.flip( + np.argsort( + self.x, kind='heapsort', axis=self.axis), + self.axis) + self.sorted_x = np.flip( + np.sort( + self.x, kind='heapsort', axis=self.axis), self.axis) + else: + self.indices = np.argsort( + self.x, kind='heapsort', axis=self.axis) + self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) + + 
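# --- Illustrative aside, not part of the patch ---------------------------------
# XPUTestArgsortOp above sets use_dynamic_create_class = True and has
# dynamic_create_class return a base class plus [class_name, attr_dict] pairs;
# the create_test_class helper imported from get_test_cover_info can then stamp
# out one concrete test class per (axis, descending) combination and per
# supported dtype. A minimal, hypothetical sketch of that stamping idea using
# type():
def stamp_out_test_classes(base_class, classes, namespace):
    for class_name, attr_dict in classes:
        # each generated subclass carries overrides such as init_axis /
        # init_descending, which TestArgsortOp.setUp reads back via hasattr()
        namespace[class_name] = type(class_name, (base_class,), dict(attr_dict))
# --------------------------------------------------------------------------------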
def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +support_types = get_xpu_op_support_types('argsort') +for stype in support_types: + create_test_class(globals(), XPUTestArgsortOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py index 0cd98d2daea2c432032da9cb9da0b977dd29ead8..30c91f87a245274f9144be5be35e8965448c2646 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py @@ -18,11 +18,13 @@ import unittest import numpy as np import sys sys.path.append("..") -from op_test import OpTest -from op_test_xpu import XPUOpTest + import paddle import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard + +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() @@ -35,76 +37,81 @@ def huber_loss_forward(val, delta): return delta * (abs_val - 0.5 * delta) -class TestHuberLossOp(XPUOpTest): - def setUp(self): - self.set_xpu() - self.op_type = 'huber_loss' - self.place = paddle.XPUPlace(0) - - self.init_dtype() - - self.set_inputs() - self.set_attrs() - self.set_outputs() +class XPUTestHuberLossOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'huber_loss' + self.use_dynamic_create_class = False - def set_inputs(self): - shape = self.set_shape() - x = np.random.uniform(0, 1., shape).astype(self.dtype) - y = np.random.uniform(0, 1., shape).astype(self.dtype) - self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) - } + class TestHuberLossOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = 'huber_loss' + self.place = paddle.XPUPlace(0) - def set_attrs(self): - self.attrs = {'delta': 0.5} + self.init_dtype() + self.set_inputs() + self.set_attrs() + self.set_outputs() - def set_outputs(self): - delta = self.attrs['delta'] - shape = self.set_shape() - residual = self.inputs['Y'] - self.inputs['X'] - loss = np.vectorize(huber_loss_forward)(residual, - delta).astype(self.dtype) - self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)} + def set_inputs(self): + shape = self.set_shape() + x = np.random.uniform(0, 1., shape).astype(self.dtype) + y = np.random.uniform(0, 1., shape).astype(self.dtype) + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } - def set_shape(self): - return (100, 1) + def set_attrs(self): + self.attrs = {'delta': 0.5} - def set_xpu(self): - self.__class__.use_xpu = True + def set_outputs(self): + delta = self.attrs['delta'] + shape = self.set_shape() + residual = self.inputs['Y'] - self.inputs['X'] + loss = np.vectorize(huber_loss_forward)(residual, + delta).astype(self.dtype) + self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)} - def init_dtype(self): - self.dtype = np.float32 + def set_shape(self): + return (100, 1) - def test_check_output(self): - self.check_output_with_place(self.place) + def set_xpu(self): + self.__class__.use_xpu = True - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + def init_dtype(self): + self.dtype = self.in_type - def test_check_grad_ingore_x(self): - 
self.check_grad_with_place( - self.place, ['Y'], 'Out', no_grad_set=set("residual")) + def test_check_output(self): + self.check_output_with_place(self.place) - def test_check_grad_ingore_y(self): - self.check_grad_with_place( - self.place, ['X'], 'Out', no_grad_set=set('residual')) + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("residual")) -def TestHuberLossOp1(TestHuberLossOp): - def set_shape(self): - return (64) + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('residual')) + class TestHuberLossOp1(TestHuberLossOp): + def set_shape(self): + return (640) -def TestHuberLossOp2(TestHuberLossOp): - def set_shape(self): - return (6, 6) + class TestHuberLossOp2(TestHuberLossOp): + def set_shape(self): + return (10, 10) + class TestHuberLossOp3(TestHuberLossOp): + def set_shape(self): + return (10, 10, 1) -def TestHuberLossOp3(TestHuberLossOp): - def set_shape(self): - return (6, 6, 1) +support_types = get_xpu_op_support_types('huber_loss') +for stype in support_types: + create_test_class(globals(), XPUTestHuberLossOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index 435026220c2b59a0f8df73f071673dab044e8348..45d60c8538e092f4c5d97f6525870af33a6ad9d5 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -97,7 +97,7 @@ class TestMatMulV2Op(XPUOpTest): self.check_grad_with_place(place, ['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -109,7 +109,7 @@ class TestMatMuklOp2(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -121,7 +121,7 @@ class TestMatMuklOp3(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -133,7 +133,7 @@ class TestMatMuklOp4(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -145,7 +145,7 @@ class TestMatMuklOp5(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -157,7 +157,7 @@ class TestMatMuklOp6(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -169,7 +169,7 @@ class TestMatMuklOp7(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -181,7 +181,7 @@ class TestMatMuklOp8(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -193,7 +193,7 @@ class TestMatMuklOp9(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -205,7 +205,7 @@ class TestMatMuklOp10(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -217,7 +217,7 @@ class TestMatMuklOp11(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): 
+class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -229,7 +229,7 @@ class TestMatMuklOp12(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -241,7 +241,7 @@ class TestMatMuklOp13(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -253,7 +253,7 @@ class TestMatMuklOp14(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -265,7 +265,7 @@ class TestMatMuklOp15(TestMatMulV2Op): self.trans_y = True -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the big data """ @@ -277,7 +277,7 @@ class TestMatMuklOp16(TestMatMulV2Op): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -289,7 +289,7 @@ class TestMatMuklOp17(TestMatMulV2Op): self.trans_y = False -# class TestMatMuklOpBroadcast1(TestMatMulV2Op): +# class TestMatMulOpBroadcast1(TestMatMulV2Op): # """ # case 14_3 # """ @@ -300,7 +300,7 @@ class TestMatMuklOp17(TestMatMulV2Op): # self.trans_x = True # self.trans_y = True -# class TestMatMuklOpBroadcast2(TestMatMulV2Op): +# class TestMatMulOpBroadcast2(TestMatMulV2Op): # """ # case 14_4 # """ diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py index 44137f4718743ccfe5290b0a53d7dd41312653a8..0830237d5a89d8397db129421158f143c79582fc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py @@ -14,188 +14,196 @@ from __future__ import print_function -import unittest +import math import numpy as np import sys +import unittest sys.path.append("..") -import math + import paddle -from op_test import OpTest + from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestPriorBoxOp(XPUOpTest): - def set_data(self): - self.init_test_params() - self.init_test_input() - self.init_test_output() - self.inputs = {'Input': self.input, 'Image': self.image} - - self.attrs = { - 'min_sizes': self.min_sizes, - 'aspect_ratios': self.aspect_ratios, - 'variances': self.variances, - 'flip': self.flip, - 'clip': self.clip, - 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, - 'step_w': self.step_w, - 'step_h': self.step_h, - 'offset': self.offset - } - if len(self.max_sizes) > 0: - self.attrs['max_sizes'] = self.max_sizes - - self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - pass - - def setUp(self): - self.op_type = "prior_box" - self.use_xpu = True - self.set_data() - - def set_max_sizes(self): - max_sizes = [5, 10] - self.max_sizes = np.array(max_sizes).astype('float32').tolist() - - def set_min_max_aspect_ratios_order(self): - self.min_max_aspect_ratios_order = False - - def init_test_params(self): - self.layer_w = 32 - self.layer_h = 32 - - self.image_w = 40 - self.image_h = 40 - - self.step_w = float(self.image_w) / float(self.layer_w) - self.step_h = float(self.image_h) / float(self.layer_h) - - self.input_channels = 2 - self.image_channels = 3 - 
self.batch_size = 10 - - self.min_sizes = [2, 4] - self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() - self.set_max_sizes() - self.aspect_ratios = [2.0, 3.0] - self.flip = True - self.set_min_max_aspect_ratios_order() - self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] - self.aspect_ratios = np.array( - self.aspect_ratios, dtype=np.float).flatten() - self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() - - self.clip = True - self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) - if len(self.max_sizes) > 0: - self.num_priors += len(self.max_sizes) - self.offset = 0.5 - - def init_test_input(self): - self.image = np.random.random( - (self.batch_size, self.image_channels, self.image_w, - self.image_h)).astype('float32') - - self.input = np.random.random( - (self.batch_size, self.input_channels, self.layer_w, - self.layer_h)).astype('float32') - - def init_test_output(self): - out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) - out_boxes = np.zeros(out_dim).astype('float32') - out_var = np.zeros(out_dim).astype('float32') - - idx = 0 - for h in range(self.layer_h): - for w in range(self.layer_w): - c_x = (w + self.offset) * self.step_w - c_y = (h + self.offset) * self.step_h - idx = 0 - for s in range(len(self.min_sizes)): - min_size = self.min_sizes[s] - if not self.min_max_aspect_ratios_order: - # rest of priors - for r in range(len(self.real_aspect_ratios)): - ar = self.real_aspect_ratios[r] - c_w = min_size * math.sqrt(ar) / 2 - c_h = (min_size / math.sqrt(ar)) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - - if len(self.max_sizes) > 0: - max_size = self.max_sizes[s] - # second prior: aspect_ratio = 1, - c_w = c_h = math.sqrt(min_size * max_size) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - else: - c_w = c_h = min_size / 2. - out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, - (c_y - c_h) / self.image_h, - (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h] - idx += 1 - if len(self.max_sizes) > 0: - max_size = self.max_sizes[s] - # second prior: aspect_ratio = 1, - c_w = c_h = math.sqrt(min_size * max_size) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - - # rest of priors - for r in range(len(self.real_aspect_ratios)): - ar = self.real_aspect_ratios[r] - if abs(ar - 1.) 
< 1e-6: - continue - c_w = min_size * math.sqrt(ar) / 2 - c_h = (min_size / math.sqrt(ar)) / 2 +class XPUTestPriorBoxOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'prior_box' + self.use_dynamic_create_class = False + + class TestPriorBoxOp(XPUOpTest): + def setUp(self): + self.op_type = "prior_box" + self.use_xpu = True + self.dtype = self.in_type + self.set_data() + + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'min_sizes': self.min_sizes, + 'aspect_ratios': self.aspect_ratios, + 'variances': self.variances, + 'flip': self.flip, + 'clip': self.clip, + 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset + } + if len(self.max_sizes) > 0: + self.attrs['max_sizes'] = self.max_sizes + + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def set_max_sizes(self): + max_sizes = [5, 10] + self.max_sizes = np.array(max_sizes).astype('float32').tolist() + + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = False + + def init_test_params(self): + self.layer_w = 32 + self.layer_h = 32 + + self.image_w = 40 + self.image_h = 40 + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.min_sizes = [2, 4] + self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() + self.set_max_sizes() + self.aspect_ratios = [2.0, 3.0] + self.flip = True + self.set_min_max_aspect_ratios_order() + self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] + self.aspect_ratios = np.array( + self.aspect_ratios, dtype=np.float).flatten() + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float).flatten() + + self.clip = True + self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) + if len(self.max_sizes) > 0: + self.num_priors += len(self.max_sizes) + self.offset = 0.5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_w, + self.image_h)).astype(self.dtype) + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_w, + self.layer_h)).astype(self.dtype) + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype(self.dtype) + out_var = np.zeros(out_dim).astype(self.dtype) + + idx = 0 + for h in range(self.layer_h): + for w in range(self.layer_w): + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + idx = 0 + for s in range(len(self.min_sizes)): + min_size = self.min_sizes[s] + if not self.min_max_aspect_ratios_order: + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / 
self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + else: + c_w = c_h = min_size / 2. out_boxes[h, w, idx, :] = [ (c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h, (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h ] idx += 1 - - # clip the prior's coordidate such that it is within[0, 1] - if self.clip: - out_boxes = np.clip(out_boxes, 0.0, 1.0) - # set the variance. - out_var = np.tile(self.variances, (self.layer_h, self.layer_w, - self.num_priors, 1)) - self.out_boxes = out_boxes.astype('float32') - self.out_var = out_var.astype('float32') - - -class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): - def set_max_sizes(self): - self.max_sizes = [] - - -class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): - def set_min_max_aspect_ratios_order(self): - self.min_max_aspect_ratios_order = True - + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + if abs(ar - 1.) < 1e-6: + continue + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # clip the prior's coordidate such that it is within[0, 1] + if self.clip: + out_boxes = np.clip(out_boxes, 0.0, 1.0) + # set the variance. + out_var = np.tile(self.variances, (self.layer_h, self.layer_w, + self.num_priors, 1)) + self.out_boxes = out_boxes.astype(self.dtype) + self.out_var = out_var.astype(self.dtype) + + class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): + def set_max_sizes(self): + self.max_sizes = [] + + class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = True + + +support_types = get_xpu_op_support_types('prior_box') +for stype in support_types: + create_test_class(globals(), XPUTestPriorBoxOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index a0503322806e5825ca720740e93c07ecf6cb51fb..72e8e73ce7c2e51b9f7d1e38dba1098149ffcf89 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -29,6 +29,7 @@ from ..fluid.core import CUDAPlace # noqa: F401 from ..fluid.core import CUDAPinnedPlace # noqa: F401 from ..fluid.core import NPUPlace # noqa: F401 from ..fluid.core import MLUPlace # noqa: F401 +from ..fluid.core import CustomPlace # noqa: F401 from ..fluid.core import VarBase # noqa: F401 from paddle.fluid import core # noqa: F401 diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 711fd1e94cae9eff403de685f152d05a8fb52a31..8dc040325934f42eca30960fcd70abdfe87a11c9 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1676,7 +1676,7 @@ def cross_entropy(input, if label_max >= input.shape[axis]: raise ValueError("label should not out of bound, but got{}". 
format(label_max)) - if core.is_compiled_with_npu(): + if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 79bacc0dfb6a7e714b292ded6f99889a43a3690b..9d55b8d1d2f12ac9a83cac33de014462173987e5 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1358,7 +1358,7 @@ class ReduceOnPlateau(LRScheduler): self.last_epoch = epoch if _in_eager_mode(): - tmp = core.eager.EagerTensor + tmp = core.eager.Tensor else: tmp = Tensor # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index cd1faf64f3ea5cdddadcaa85cd68520b255d1db4..dd56b391d10ff8dc47abaa0dc963b49d4e7961a9 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -106,9 +106,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): if place is None: place = _current_expected_place() elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, - core.CUDAPlace, core.NPUPlace, core.XPUPlace)): + core.CUDAPlace, core.NPUPlace, core.XPUPlace, + core.CustomPlace)): raise ValueError( - "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace" + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.CustomPlace" ) #Todo(zhouwei): Support allocate tensor on any other specified card @@ -168,8 +169,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): # TOOD(jiabin): Support kwargs in eager tensor constructor if _in_eager_mode() and isinstance(data, np.ndarray): - return core.eager.EagerTensor(data, place, False, False, None, - stop_gradient) + return core.eager.Tensor(data, place, False, False, None, stop_gradient) else: return paddle.Tensor( value=data, diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index af0f33f97ab4f59e79ce4d247d0e648147613283..0e76d92ca73ef35ede331d19683cbd6e22013141 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -263,7 +263,7 @@ def to_string(var, prefix='Tensor'): data=data) -def eager_tensor_to_string(tensor, prefix='Tensor'): +def tensor_to_string(tensor, prefix='Tensor'): indent = len(prefix) + 1 _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" diff --git a/python/paddle/tests/hapi_mnist_bf16_static.py b/python/paddle/tests/hapi_mnist_bf16_static.py new file mode 100644 index 0000000000000000000000000000000000000000..7eb4d61a21ee12c5357e8af911017b1523d78dba --- /dev/null +++ b/python/paddle/tests/hapi_mnist_bf16_static.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle + +from paddle import Model, set_device +from paddle.static import InputSpec as Input +from paddle.metric import Accuracy +from paddle.vision.datasets import MNIST +from paddle.vision.models import LeNet +import paddle.static.amp as amp +import random +from paddle import callbacks +import argparse +import ast + +SEED = 2 +paddle.seed(SEED) +paddle.framework.random._manual_program_seed(SEED) +np.random.seed(SEED) +random.seed(SEED) + +paddle.enable_static() +set_device('cpu') + + +def parse_args(): + parser = argparse.ArgumentParser("Lenet BF16 train static script") + parser.add_argument( + '-bf16', + '--bf16', + type=ast.literal_eval, + default=False, + help="whether use bf16") + args = parser.parse_args() + return args + + +class MnistDataset(MNIST): + def __init__(self, mode, return_label=True): + super(MnistDataset, self).__init__(mode=mode) + self.return_label = return_label + + def __getitem__(self, idx): + img = np.reshape(self.images[idx], [1, 28, 28]) + if self.return_label: + return img, np.array(self.labels[idx]).astype('int64') + return img, + + def __len__(self): + return len(self.images) + + +def compute_accuracy(pred, gt): + pred = np.argmax(pred, -1) + gt = np.array(gt) + + correct = pred[:, np.newaxis] == gt + + return np.sum(correct) / correct.shape[0] + + +def main(args): + print('download training data and load training data') + train_dataset = MnistDataset(mode='train', ) + val_dataset = MnistDataset(mode='test', ) + test_dataset = MnistDataset(mode='test', return_label=False) + + im_shape = (-1, 1, 28, 28) + batch_size = 64 + + inputs = [Input(im_shape, 'float32', 'image')] + labels = [Input([None, 1], 'int64', 'label')] + + model = Model(LeNet(), inputs, labels) + optim = paddle.optimizer.SGD(learning_rate=0.001) + if args.bf16: + optim = amp.bf16.decorate_bf16( + optim, + amp_lists=amp.bf16.AutoMixedPrecisionListsBF16( + custom_bf16_list={ + 'matmul_v2', 'pool2d', 'relu', 'scale', 'elementwise_add', + 'reshape2', 'slice', 'reduce_mean', 'conv2d' + }, )) + + # Configuration model + model.prepare(optim, paddle.nn.CrossEntropyLoss(), Accuracy()) + # Training model # + if args.bf16: + print('Training BF16') + else: + print('Training FP32') + model.fit(train_dataset, epochs=2, batch_size=batch_size, verbose=1) + eval_result = model.evaluate(val_dataset, batch_size=batch_size, verbose=1) + + output = model.predict( + test_dataset, batch_size=batch_size, stack_outputs=True) + + np.testing.assert_equal(output[0].shape[0], len(test_dataset)) + + acc = compute_accuracy(output[0], val_dataset.labels) + + print("acc", acc) + print("eval_result['acc']", eval_result['acc']) + + np.testing.assert_allclose(acc, eval_result['acc']) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 8b8b001739f3f8d652b6814c135225ec76f2743f..66411d00f1517c18c7eea820980305e5a70ec2e8 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1,5 +1,5 @@ - api : add - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : ElementwiseInferMeta @@ -7,7 +7,7 @@ func : add - api : cast - args : (const Tensor& x, DataType out_dtype) + args : (Tensor x, DataType out_dtype) output : Tensor 
infer_meta : func : CastInferMeta @@ -18,7 +18,7 @@ - api : concat - args : (const std::vector& x, const Scalar& axis) + args : (Tensor[] x, Scalar axis) output : Tensor infer_meta : func : ConcatInferMeta @@ -27,7 +27,7 @@ func : concat - api : conj - args : (const Tensor& x) + args : (Tensor x) output : Tensor infer_meta : func : UnchangedInferMeta @@ -35,7 +35,7 @@ func : conj - api : divide - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : ElementwiseInferMeta @@ -43,7 +43,7 @@ func : divide - api : dot - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : DotInferMeta @@ -51,7 +51,7 @@ func : dot - api : empty - args : (const ScalarArray& shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) + args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) output: Tensor infer_meta : func : CreateInferMeta @@ -64,7 +64,7 @@ layout : layout - api : empty_like - args : (const Tensor& x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) + args : (Tensor x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) output: Tensor infer_meta : func : CreateLikeInferMeta @@ -77,7 +77,7 @@ layout : layout > x - api : flatten - args : (const Tensor& x, int start_axis, int stop_axis) + args : (Tensor x, int start_axis, int stop_axis) output : Tensor infer_meta : func : FlattenInferMeta @@ -85,7 +85,7 @@ func : flatten - api : full - args : (const ScalarArray& shape, const Scalar& value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) + args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) output: Tensor infer_meta : func : CreateInferMeta @@ -98,7 +98,7 @@ layout : layout - api : full_like - args : (const Tensor& x, const Scalar& value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) + args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) output: Tensor infer_meta : func : CreateLikeInferMeta @@ -111,7 +111,7 @@ layout : layout > x - api : matmul - args : (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false) + args : (Tensor x, Tensor y, bool transpose_x = false, bool transpose_y = false) output : Tensor infer_meta : func : MatmulInferMeta @@ -120,7 +120,7 @@ backward : matmul_grad - api : mean - args : (const Tensor& x, const std::vector& axis={}, bool keep_dim=false) + args : (Tensor x, int64_t[] axis={}, bool keep_dim=false) output : Tensor infer_meta : func : ReduceInferMeta @@ -128,7 +128,7 @@ func : mean - api : multiply - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : ElementwiseInferMeta @@ -136,12 +136,12 @@ func : multiply - api : ones_like - args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) + args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) output : Tensor invoke : full_like(x, 1, dtype, 
place, layout) - api : reshape - args : (const Tensor& x, const ScalarArray& shape) + args : (Tensor x, ScalarArray shape) output : Tensor infer_meta : func : ReshapeInferMeta @@ -149,7 +149,7 @@ func : reshape - api : scale - args : (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale) + args : (Tensor x, Scalar scale, float bias, bool bias_after_scale) output : Tensor infer_meta : func : UnchangedInferMeta @@ -158,7 +158,7 @@ func : scale, scale_sr - api : sign - args : (const Tensor& x) + args : (Tensor x) output : Tensor infer_meta : func : UnchangedInferMeta @@ -166,7 +166,7 @@ func : sign - api : subtract - args : (const Tensor& x, const Tensor& y) + args : (Tensor x, Tensor y) output : Tensor infer_meta : func : ElementwiseInferMeta @@ -174,7 +174,7 @@ func : subtract - api : sum - args : (const Tensor& x, const std::vector& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) + args : (Tensor x, int64_t[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) output : Tensor infer_meta : func : SumInferMeta @@ -184,6 +184,6 @@ data_type : x - api : zeros_like - args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) + args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) output : Tensor invoke : full_like(x, 0, dtype, place, layout) diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 2e1ed58e1c40b6794ed83b6e07183099ad04f00b..73c3ba4e4b4fe9d56ac6e6c7638777fc5df89164 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -71,23 +71,26 @@ class BaseAPI(object): f"Args declaration should start with '(' and end with ')', please check the args of {api_name} in yaml." args_str = args_str[1:-1] args_list = args_str.split(',') - input_types = [ - 'const Tensor&', 'const Tensor &', 'const std::vector&', - 'const std::vector &' - ] - attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \ - 'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \ - 'const std::vector&', 'Backend', 'DataLayout', 'DataType'] + input_types_map = { + 'Tensor': 'const Tensor&', + 'Tensor[]': 'const std::vector&' + } + attr_types_map = {'ScalarArray' : 'const ScalarArray&', 'Scalar' : 'const Scalar&', \ + 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ + 'int64_t[]' : 'const std::vector&', 'int[]' : 'const std::vector&'} args_declare_str = "" args_define_str = "" for item in args_list: item = item.strip() + type_and_name = item.split(' ') # match the input tensor has_input = False - for in_type in input_types: - if item.startswith(in_type): - input_name = item[len(in_type):].strip() + for in_type_symbol, in_type in input_types_map.items(): + if type_and_name[0] == in_type_symbol: + input_name = type_and_name[1].strip() assert len(input_name) > 0, \ f"The input tensor name should not be empty. Please check the args of {api_name} in yaml." 
assert len(attrs['names']) == 0, \ @@ -103,9 +106,9 @@ class BaseAPI(object): continue # match the attribute - for attr_type in attr_types: - if item.startswith(attr_type): - attr_name = item[len(attr_type):].strip() + for attr_type_symbol, attr_type in attr_types_map.items(): + if type_and_name[0] == attr_type_symbol: + attr_name = item[len(attr_type_symbol):].strip() assert len(attr_name) > 0, \ f"The attribute name should not be empty. Please check the args of {api_name} in yaml." default_value = None @@ -128,25 +131,28 @@ class BaseAPI(object): def parse_output(self, api_name, output_config): def parse_output_item(output_item): - alllowd_output_types = ['Tensor', 'std::vector'] + output_type_map = { + 'Tensor': 'Tensor', + 'Tensor[]': 'std::vector' + } if re.search(r'\(\w*\)', output_item): result = re.search( - r"(?P[a-zA-Z0-9_<>]+)\s*\((?P\w+)\)", + r"(?P[a-zA-Z0-9_[\]]+)\s*\((?P\w+)\)", output_item) out_type = result.group('out_type') - assert out_type in alllowd_output_types, \ - f"{api_name} : Output type error: the output type only support Tensor and std::vector, \ + assert out_type in output_type_map, \ + f"{api_name} : Output type error: the output type only support Tensor and Tensor[], \ but now is {out_type}." return out_type, result.group('name') else: - if output_item.strip() in alllowd_output_types: - return output_item.strip(), 'out' + if output_item.strip() in output_type_map: + return output_type_map[output_item.strip()], 'out' else: raise ValueError( - "{} : Output type error: the output type only support Tensor and std::vector, \ - but now is {}.".format(api_name, out_type)) + "{} : Output type error: the output type only support Tensor and Tensor[], \ + but now is {}.".format(api_name, output_item.strip())) temp_list = output_config.split(',') diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index d14cf11c8dd7eaea2482e7a043c76530fc6fc7d7..62b724432e9283613f69852ec04eda55a88b0ab2 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,33 +1,34 @@ - backward_api : matmul_grad - forward : matmul (const Tensor& x, const Tensor& y, bool transpose_x=false, bool transpose_y=false) -> Tensor(out) - args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false) + forward : matmul (Tensor x, Tensor y, bool transpose_x=false, bool transpose_y=false) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, bool transpose_x=false, bool transpose_y=false) output : Tensor(x_grad), Tensor(y_grad) infer_meta : - func : MatmulGradInferMeta + func : GeneralBinaryGradInferMeta + param : [x, y] kernel : func : matmul_grad - backward_api : scale_grad - forward : scale (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale) -> Tensor(out) - args : (const Tensor& out_grad, const Scalar& scale, float bias=0.0, bool bias_after_scale=true) + forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out) + args : (Tensor out_grad, Scalar scale, float bias=0.0, bool bias_after_scale=true) output : Tensor(x_grad) invoke : scale(out_grad, scale, bias, bias_after_scale) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. 
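# --- Illustrative aside, not part of the patch ---------------------------------
# The api_base.py rework earlier in this patch keys each yaml argument off its
# first whitespace-separated token ("Tensor", "Tensor[]", "Scalar", "int64_t[]",
# ...) and maps it to the C++ parameter type, instead of matching C++ type
# prefixes in the argument string. A minimal, hypothetical sketch of that lookup:
input_types_map = {
    'Tensor': 'const Tensor&',
    'Tensor[]': 'const std::vector<Tensor>&',
}

def resolve_input(item):
    # "Tensor x" -> ('const Tensor&', 'x'); returns None for attribute entries
    token, _, name = item.strip().partition(' ')
    cpp_type = input_types_map.get(token)
    return (cpp_type, name.strip()) if cpp_type else None
# --------------------------------------------------------------------------------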
# # - backward_api : matmul_double_grad -# forward : matmul_grad (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x, bool transpose_y) -> tuple(dx, dy) -# args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, bool transpose_x, bool transpose_y) -# output : tuple // d2x, d2y, dout_grad +# forward : matmul_grad (Tensor x, Tensor y, Tensor out_grad, bool transpose_x, bool transpose_y) -> Tensor(dx), Tensor>(dy) +# args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) +# output : Tensor(d2x), Tensor(d2y), Tensor(dout_grad) # infer_meta : # func : MatmulDoubleGradInferMeta # kernel : # func : matmul_double_grad # - backward_api : matmul_triple_grad -# forward : matmul_double_grad (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, bool transpose_x, bool transpose_y) -> tuple(d2x, d2y, dout_grad) -# args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, const Tensor& d2x_grad, const Tensor& d2y_grad, const Tensor& dout_grad_grad, bool transpose_x, bool transpose_y) -# output : tuple // d3x, d3y, d2out_grad, ddx_grad, ddy_grad +# forward : matmul_double_grad (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) -> Tensor(d2x), Tensor(d2y), Tensor(dout_grad) +# args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, Tensor d2x_grad, Tensor d2y_grad, Tensor dout_grad_grad, bool transpose_x, bool transpose_y) +# output : Tensor(d3x), Tensor(d3y), Tensor(d2out_grad), Tensor(ddx_grad), Tensor(ddy_grad) # infer_meta : # func : MatmulTripleGradInferMeta # kernel : diff --git a/python/setup.py.in b/python/setup.py.in index 8f42beaf1c09b5e9d23946fb6436151590868072..9977ddeb26b17f6e69dbd49b782ff50490ab55a5 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -579,7 +579,8 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/core', recursive=True)) + # pten core headers # utila api headers ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h']) + ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h']) if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn @@ -624,6 +625,8 @@ class InstallHeaders(Command): elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) + if 'device_ext.h' in header: + install_dir = "paddle/" else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 4fd4e809cacbec0e65223a200f56b26c7f34e6b6..a36f173454f6a57bc9407b7b56042de5a14e32a7 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -262,15 +262,17 @@ if [ "${PTEN_INCLUDE_FLUID_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must have one RD (chenwhql, MingMingShangTian, YuanRisheng or zyfncg) approval for the including paddle/fluid header in paddle/pten files(${PTEN_INCLUDE_FLUID_FILES}).\n" check_approval 1 chenwhql MingMingShangTian YuanRisheng zyfncg fi + +HAS_MODIFIED_PTEN_KERNEL_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/pten/kernels" || true` PTEN_USE_MUTABLE_DATA_FILES="" -for 
CHANGE_FILE in ${HAS_MODIFIED_PTEN_FILES}; do +for CHANGE_FILE in ${HAS_MODIFIED_PTEN_KERNEL_FILES}; do PTEN_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true` if [ "${PTEN_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then PTEN_USE_MUTABLE_DATA_FILES="${PTEN_USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}" fi done if [ "${PTEN_USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You can not use the DenseTensor::mutable_data() method in paddle/pten files(${PTEN_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use pten::DeviceContext::Alloc() or pten::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, MingMingShangTian, YuanRisheng or zyfncg) review and approve.\n" + echo_line="You can not use the DenseTensor::mutable_data() method in paddle/pten/kernels files(${PTEN_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use pten::DeviceContext::Alloc() or pten::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, MingMingShangTian, YuanRisheng or zyfncg) review and approve.\n" check_approval 1 chenwhql Shixiaowei02 MingMingShangTian YuanRisheng zyfncg fi diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh index b77f3eb00ff06eedbf6fa14d8dbb1078d2375770..f1221f058bc6dda17d7c25a0e4c74d47143c23e9 100644 --- a/tools/coverage/paddle_coverage.sh +++ b/tools/coverage/paddle_coverage.sh @@ -48,6 +48,7 @@ function gen_full_html_report() { '/paddle/paddle/fluid/operators/*' \ '/paddle/paddle/fluid/recordio/*' \ '/paddle/paddle/fluid/string/*' \ + '/paddle/paddle/pten/*' \ -o coverage-full.tmp \ --rc lcov_branch_coverage=0 @@ -59,6 +60,7 @@ function gen_full_html_report() { '/paddle/paddle/fluid/*/*/*test*' \ '/paddle/paddle/fluid/inference/tests/*' \ '/paddle/paddle/fluid/inference/api/demo_ci/*' \ + '/paddle/paddle/pten/tests/*' \ -o coverage-full.tmp \ --rc lcov_branch_coverage=0 diff --git a/tools/infrt/get_pten_kernel_function.sh b/tools/infrt/get_pten_kernel_function.sh new file mode 100755 index 0000000000000000000000000000000000000000..0d787d9930b2c739733e8431eaccece88519248a --- /dev/null +++ b/tools/infrt/get_pten_kernel_function.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#================================================= +# Utils +#================================================= + +set -e + +#step 1:get kernel registered info +kernel_register_info_file=`mktemp` +PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" +unset GREP_OPTIONS && find ${PADDLE_ROOT}/paddle/pten/kernels -name "*.c*" \ + | xargs sed -e '/PT_REGISTER_\(GENERAL_\)\?KERNEL(/,/)/!d' \ + | awk 'BEGIN { RS="{" }{ gsub(/\n /,""); print $0 }' \ + | grep PT_REGISTER \ + | awk -F ",|\(" '{gsub(/ /,"");print $2, $3, $4, $5}' \ + | sort -u | awk '{gsub(/pten::/,"");print $0}' \ + | grep -v "_grad" > $kernel_register_info_file + +#step 2:get simple general inferMeta function wrap info +temp_path=`mktemp -d` +python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ + --api_yaml_path ${PADDLE_ROOT}/python/paddle/utils/code_gen/api.yaml \ + --wrapped_infermeta_header_path ${temp_path}/generate.h \ + --wrapped_infermeta_source_path ${temp_path}/generate.cc + +grep PT_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ + | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt + +#step 3: merge all infos +# @input1 => pten kernel information : kernel_name kernel_key(GPU/CPU, precision, layout) +# @input2 => information from api.yaml : kernel_name kernel_function_name inferMeta_function_name +# @input3 => information from wrapped_infermeta_gen : ensure the inferMeta function has +# the same signature as the kernel function +python3 ${PADDLE_ROOT}/tools/infrt/get_pten_kernel_info.py \ + --paddle_root_path ${PADDLE_ROOT} \ + --kernel_info_file $kernel_register_info_file \ + --infermeta_wrap_file ${temp_path}/wrap_info.txt diff --git a/paddle/scripts/get_pten_kernel_info.py b/tools/infrt/get_pten_kernel_info.py similarity index 73% rename from paddle/scripts/get_pten_kernel_info.py rename to tools/infrt/get_pten_kernel_info.py index 5575fac41fe3d30f7d2119eaece023f5da25fef1..e311464130008e9c7815c028f69b2d29eef3b349 100644 --- a/paddle/scripts/get_pten_kernel_info.py +++ b/tools/infrt/get_pten_kernel_info.py @@ -31,6 +31,11 @@ def parse_args(): type=str, required=True, help="kernel info file generated by get_pten_kernel_function.sh .") + parser.add_argument( + "--infermeta_wrap_file", + type=str, + required=True, + help="inferMeta wrap info file .") args = parser.parse_args() return args @@ -47,17 +52,24 @@ def get_kernel_info(file_path): return [l.strip() for l in cont] -def merge(infer_meta_data, kernel_data): +def merge(infer_meta_data, kernel_data, wrap_data): meta_map = {} for api in infer_meta_data: - if not api.has_key("kernel") or not api.has_key("infer_meta"): + if "kernel" not in api or "infer_meta" not in api: continue meta_map[api["kernel"]["func"]] = api["infer_meta"]["func"] + wrap_map = {} + for l in wrap_data: + wrap_map[l.split()[0]] = l.split()[1] + full_kernel_data = [] for l in kernel_data: key = l.split()[0] - if meta_map.has_key(key): - full_kernel_data.append((l + " " + meta_map[key]).split()) + if key in meta_map: + if key in wrap_map: + full_kernel_data.append((l + " " + wrap_map[key]).split()) + else: + full_kernel_data.append((l + " " + meta_map[key]).split()) else: full_kernel_data.append((l + " unknown").split()) @@ -68,5 +80,6 @@ if __name__ == "__main__": args = parse_args() infer_meta_data = get_api_yaml_info(args.paddle_root_path) kernel_data = get_kernel_info(args.kernel_info_file) - out = merge(infer_meta_data, kernel_data) + info_meta_wrap_data = get_kernel_info(args.infermeta_wrap_file) + out = merge(infer_meta_data,
kernel_data, info_meta_wrap_data) print(json.dumps(out))
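For reference, the standalone sketch below mirrors the merge step that tools/infrt/get_pten_kernel_info.py performs on its three inputs. The kernel entries, kernel keys, and the wrapped infer_meta name are hypothetical placeholders for illustration only; the real values come from the PT_REGISTER_* scan, api.yaml, and the wrapped-infermeta grep in get_pten_kernel_function.sh.

import json

# Hypothetical stand-ins for the three inputs (names and keys are made up).
infer_meta_data = [  # parsed from api.yaml
    {"kernel": {"func": "matmul"}, "infer_meta": {"func": "MatmulInferMeta"}},
    {"api": "scale"},  # entries without kernel/infer_meta are skipped
]
kernel_data = [  # lines produced by the kernel registration scan
    "matmul CPU ALL_LAYOUT float32",
    "sign GPU ALL_LAYOUT float32",
]
wrap_data = ["matmul MatmulInferMetaWrapped"]  # lines from wrap_info.txt

# kernel func -> infer_meta func, as built in merge()
meta_map = {}
for api in infer_meta_data:
    if "kernel" not in api or "infer_meta" not in api:
        continue
    meta_map[api["kernel"]["func"]] = api["infer_meta"]["func"]

# kernel func -> wrapped infer_meta func
wrap_map = {line.split()[0]: line.split()[1] for line in wrap_data}

full_kernel_data = []
for line in kernel_data:
    key = line.split()[0]
    if key in meta_map:
        # prefer the generated wrapper when one exists for this kernel
        full_kernel_data.append((line + " " + wrap_map.get(key, meta_map[key])).split())
    else:
        full_kernel_data.append((line + " unknown").split())

print(json.dumps(full_kernel_data))
# [["matmul", "CPU", "ALL_LAYOUT", "float32", "MatmulInferMetaWrapped"],
#  ["sign", "GPU", "ALL_LAYOUT", "float32", "unknown"]]

In short: kernels with a generated wrapper pick it up, kernels known only to api.yaml fall back to the plain infer_meta function, and everything else is tagged "unknown".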