diff --git a/README.md b/README.md
index f058b20e1f0322b2fabd39bca1f61dc6f2a4a552..78396610c07933122675a8f9a9fee99007213135 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ English | [简体中文](./README_cn.md)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+[![Twitter](https://img.shields.io/badge/Twitter-1ca0f1.svg?logo=twitter&logoColor=white)](https://twitter.com/PaddlePaddle_)
 
 Welcome to the PaddlePaddle GitHub.
 
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index c7a4e1d99bff16feff685c6da98ef72cdd9d89b7..aa8ab62d7ae28197e9f18b83440f75c8c68d8fff 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -23,7 +23,7 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 # in case of low internet speed
 #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
 set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
 
 set(WARPCTC_INCLUDE_DIR
     "${WARPCTC_INSTALL_DIR}/include"
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index e626669d2a73af1cb7125663a78c128d62ba28ae..8e08eb84b9f3577dc072f8ffe31d231e03cfea90 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -26,7 +26,7 @@ function(find_register FILENAME PATTERN OUTPUT)
       PARENT_SCOPE)
 endfunction()
 
-function(find_phi_register FILENAME ADD_PATH)
+function(find_phi_register FILENAME ADD_PATH PATTERN)
   # set op_name to OUTPUT
   set(options "")
   set(oneValueArgs "")
@@ -36,11 +36,11 @@ function(find_phi_register FILENAME ADD_PATH)
   string(
     REGEX
     MATCH
-    "PD_REGISTER_KERNEL\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
+    "${PATTERN}\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
     register
     "${CONTENT}")
   if(NOT register STREQUAL "")
-    string(REPLACE "PD_REGISTER_KERNEL(" "" register "${register}")
+    string(REPLACE "${PATTERN}(" "" register "${register}")
     string(REPLACE "," ";" register "${register}")
     string(REGEX REPLACE "[ \\\t\r\n]+" "" register "${register}")
     string(REGEX REPLACE "//cuda_only" "" register "${register}")
@@ -401,7 +401,8 @@ function(op_library TARGET)
       # pybind USE_OP_ITSELF
       set(op_name "")
       # Add PHI Kernel Registry Message
-      find_phi_register(${cc_src} ${pybind_file})
+      find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_KERNEL")
+      find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
       find_register(${cc_src} "REGISTER_OPERATOR" op_name)
       if(NOT ${op_name} EQUAL "")
         file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
@@ -440,7 +441,8 @@ function(op_library TARGET)
     foreach(cu_src ${cu_srcs})
       set(op_name "")
       # Add PHI Kernel Registry Message
-      find_phi_register(${cu_src} ${pybind_file})
+      find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_KERNEL")
+      find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
      find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
       if(NOT ${op_name} EQUAL "")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 6fac4d2c64080d7af7d200696c4430dcf91f96f0..602f966cf8ebc2c88d87903b956faa1bc40107e5 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -446,7 +446,8 @@ endif()
 
 if(WITH_DISTRIBUTE
    AND NOT WITH_PSLIB
-   AND NOT WITH_PSCORE)
+   AND NOT WITH_PSCORE
+   AND NOT WITH_RPC)
   include(external/snappy)
   list(APPEND third_party_deps extern_snappy)
 
diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc
index 5a8d4bca1cc77b15a9c31f3fc2e1207949f615d5..de4a84bff4808f96ba5fecb3bb7f1fece54c3870 100644
--- a/paddle/fluid/distributed/collective/process_group_bkcl.cc
+++ b/paddle/fluid/distributed/collective/process_group_bkcl.cc
@@ -16,6 +16,7 @@
 
 #include "paddle/fluid/distributed/collective/bkcl_tools.h"
 #include "paddle/fluid/distributed/collective/common.h"
+#include "paddle/fluid/distributed/collective/utils.h"
 #include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/phi/core/device_context.h"
@@ -87,6 +88,73 @@ void ProcessGroupBKCL::GroupEnd() {
   PADDLE_ENFORCE_XPU_SUCCESS(bkcl_group_end());
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
+    phi::DenseTensor* tensor,
+    int src_rank,
+    int64_t offset,
+    int64_t numel,
+    bool sync_op,
+    bool use_calc_stream) {
+  // numel > 0 indicates that the tensor needs to be sliced
+  phi::DenseTensor partial_tensor;
+  if (numel > 0) {
+    partial_tensor = GetPartialTensor(*tensor, offset, numel);
+    tensor = &partial_tensor;
+  }
+
+  return Collective(
+      tensor,
+      // have to pass a tensor here
+      // TODO(zhangxiaoci) catch up with nccl's api
+      *tensor,
+      [&](phi::DenseTensor* output,
+          const phi::DenseTensor& input,
+          BKCLContext_t comm,
+          const XPUStream& stream) {
+        return bkcl_recv(comm,
+                         output->data(),
+                         output->numel(),
+                         src_rank,
+                         platform::ToBKCLDataType(
+                             framework::TransToProtoVarType(output->type())),
+                         stream);
+      },
+      CommType::RECV,
+      sync_op,
+      use_calc_stream);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
+    const phi::DenseTensor& tensor,
+    int dst_rank,
+    int64_t offset,
+    int64_t numel,
+    bool sync_op,
+    bool use_calc_stream) {
+  // numel > 0 indicates that the tensor needs to be sliced
+  const phi::DenseTensor& tensor_maybe_partial =
+      numel > 0 ? GetPartialTensor(tensor, offset, numel) : tensor;
+
+  return Collective(
+      nullptr,
+      tensor_maybe_partial,
+      [&](phi::DenseTensor* output,
+          const phi::DenseTensor& input,
+          BKCLContext_t comm,
+          const XPUStream& stream) {
+        return bkcl_send(comm,
+                         input.data(),
+                         input.numel(),
+                         dst_rank,
+                         platform::ToBKCLDataType(
+                             framework::TransToProtoVarType(input.type())),
+                         stream);
+      },
+      CommType::SEND,
+      sync_op,
+      use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroupBKCL::BKCLTask> ProcessGroupBKCL::CreateTask(
     const Place& place,
     int rank,
@@ -136,6 +204,8 @@ void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place,
   BKCLContext_t bkcl_comm;
   BKCLCHECK(bkcl_init_rank(&bkcl_comm, GetRank(), GetSize(), &bkcl_id));
   comm_ctx->SetBkclContext(bkcl_comm);
+  // comm context creates a separate XPU stream for communication
+  comm_ctx->CreateStream();
 
   place_to_calc_ctx_[place_key] = calc_ctx;
   place_to_comm_ctx_[place_key] = std::move(comm_ctx);
diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.h b/paddle/fluid/distributed/collective/process_group_bkcl.h
index cf8c983d8e66a884a54d2426b03685102140ebfb..1ecf8c9c0ff96af52038fa6be540263de913297b 100644
--- a/paddle/fluid/distributed/collective/process_group_bkcl.h
+++ b/paddle/fluid/distributed/collective/process_group_bkcl.h
@@ -87,25 +87,25 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
   phi::DeviceContext* GetDeviceContext(const Place& place,
                                        bool use_calc_stream) const override;
 
-  std::shared_ptr<ProcessGroup::Task> AllReduce(
+  std::shared_ptr<ProcessGroup::Task> AllGather(
       phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
-      const AllreduceOptions& opts,
+      int64_t offset,  // for compatibility, no use now
+      int64_t numel,   // for compatibility, no use now
       bool sync_op,
       bool use_calc_stream) override;
 
-  std::shared_ptr<ProcessGroup::Task> Broadcast(
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
       phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
-      const BroadcastOptions& opts,
+      const AllreduceOptions& opts,
       bool sync_op,
       bool use_calc_stream) override;
 
-  std::shared_ptr<ProcessGroup::Task> AllGather(
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
      phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
-      int64_t offset,  // for compatibility, no use now
-      int64_t numel,   // for compatibility, no use now
+      const BroadcastOptions& opts,
       bool sync_op,
       bool use_calc_stream) override;
 
@@ -115,6 +115,20 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
       bool sync_op,
       bool use_calc_stream) override;
 
+  std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
+                                           int src_rank,
+                                           int64_t offset,
+                                           int64_t numel,
+                                           bool sync_op,
+                                           bool use_calc_stream) override;
+
+  std::shared_ptr<ProcessGroup::Task> Send(const phi::DenseTensor& tensor,
+                                           int dst_rank,
+                                           int64_t offset,
+                                           int64_t numel,
+                                           bool sync_op,
+                                           bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> Barrier(
       const BarrierOptions& = BarrierOptions()) override;
 
diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
index 72c689732b5b7df5f61d28d93a3bef6e305f426d..a166ff0b6dfa2f381da02ff0e90dadc08732de5e 100644
--- a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
@@ -33,7 +33,7 @@ void AmplifierInterceptor::RunOps() {
   // run_per_steps_, run_at_offset_
   // 4, 0 --> run at step 0, 4, 8, 12
   // 4, 3 --> run at step 3, 7, 11, 15
-  if ((step_ % run_per_steps_) == run_at_offset_) {
+  if ((cur_scope_id_ % run_per_steps_) == run_at_offset_) {
     ComputeInterceptor::RunOps();
   }
 }
@@ -41,7 +41,7 @@ void AmplifierInterceptor::SendDataReadyToDownStream() {
   // run multi times, send ready one times to downstream, that is
   // input multi times, output one times
-  if (step_ % send_down_per_steps_ == 0) {
+  if (cur_scope_id_ % send_down_per_steps_ == 0) {
     ComputeInterceptor::SendDataReadyToDownStream();
   }
 }
@@ -49,7 +49,7 @@ void AmplifierInterceptor::ReplyCompletedToUpStream() {
   // run multi times, reply one times to upstream, that is
   // input one times, output multi times
-  if (step_ % reply_up_per_steps_ == 0) {
+  if (cur_scope_id_ % reply_up_per_steps_ == 0) {
     ComputeInterceptor::ReplyCompletedToUpStream();
   }
 }
diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h
index 776aa8d3e88db10d551d6fd0180a5da9d6a6f3db..93e8ffa1d75aecc063b05fff84545238e7a1fba2 100644
--- a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h
@@ -21,7 +21,7 @@
 namespace paddle {
 namespace distributed {
 
-class AmplifierInterceptor : public ComputeInterceptor {
+class AmplifierInterceptor final : public ComputeInterceptor {
  public:
   AmplifierInterceptor(int64_t interceptor_id, TaskNode* node);
 
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc
index 6fb0d55a4859ef39d04857d39d1e70f6a31bb4a3..3449c87998a9dba21824e854afdb7216cb818164 100644
--- a/paddle/fluid/distributed/fleet_executor/carrier.cc
+++ b/paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -71,6 +71,9 @@ void Carrier::Init(
     microbatch_scopes_[i] = &minibatch_scope_->NewScope();
     CopyParameters(i, program, inference_root_scope_vars);
   }
+  // Add source and sink interceptor id to rank
+  interceptor_id_to_rank_.emplace(SOURCE_ID, rank);
+  interceptor_id_to_rank_.emplace(SINK_ID, rank);
 
   // TODO(fleet_exe dev): thread pool
   thread_num_ = 1;
@@ -159,16 +162,10 @@ void Carrier::Start() {
       true,
       platform::errors::PreconditionNotMet(
          "Using carrier before initialized."));
-  for (int64_t id : source_interceptor_ids_) {
-    VLOG(3) << "Carrier Start is sending start to source interceptor " << id
-            << ".";
-    InterceptorMessage start_msg;
-    // source node data_is_ready is send by carrier, so set src_id=-1
-    start_msg.set_src_id(-1);
-    start_msg.set_dst_id(id);
-    start_msg.set_message_type(DATA_IS_READY);
-    Send(start_msg);
-  }
+  InterceptorMessage start_msg;
+  start_msg.set_dst_id(SOURCE_ID);
+  start_msg.set_message_type(START);
+  Send(start_msg);
   // TODO(wangxi): async step
   Wait();
   dev_ctx_->Wait();
@@ -270,6 +267,38 @@ void Carrier::CreateInterceptors() {
 
   auto gc = GetGC(place_);
 
+  // create source and sink task node
+  auto max_run_times = microbatch_scopes_.size();
+  TaskNode* source = new TaskNode(
+      rank_, SOURCE_ID, max_run_times);  // rank, task_id, max_run_times
+  TaskNode* sink = new TaskNode(rank_, SINK_ID, max_run_times);
+  // find nodes without upstreams or without downstreams
+  std::vector<TaskNode*> origin_sources, origin_sinks;
+  for (const auto& item : interceptor_id_to_node_) {
+    TaskNode* task_node = item.second;
+    if (task_node->upstream().empty()) {
+      origin_sources.emplace_back(task_node);
+    }
+    if (task_node->downstream().empty()) {
+      origin_sinks.emplace_back(task_node);
+    }
+  }
+  // link source node with origin source
+  for (const auto& node : origin_sources) {
+    source->AddDownstreamTask(node->task_id(),
+                              std::numeric_limits<int64_t>::max());
+    node->AddUpstreamTask(SOURCE_ID, std::numeric_limits<int64_t>::max());
+  }
+  // link sink node with origin sink
+  for (const auto& node : origin_sinks) {
+    sink->AddUpstreamTask(node->task_id(), std::numeric_limits<int64_t>::max());
+    node->AddDownstreamTask(SINK_ID, std::numeric_limits<int64_t>::max());
+  }
+  // create source and sink interceptor
+  SetInterceptor(SOURCE_ID,
+                 InterceptorFactory::Create("Source", SOURCE_ID, source));
+  SetInterceptor(SINK_ID, InterceptorFactory::Create("Sink", SINK_ID, sink));
+
   // create each Interceptor
   // no auto init since there is no config
   for (const auto& item : interceptor_id_to_node_) {
@@ -303,9 +332,15 @@ void Carrier::CreateInterceptors() {
     VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
             << " with type: " << task_node->type() << ".";
 
-    if (task_node->upstream().empty()) {
-      source_interceptor_ids_.emplace_back(interceptor_id);
-    }
+    PADDLE_ENFORCE_EQ(
+        task_node->upstream().empty(),
+        false,
+        platform::errors::PreconditionNotMet(
+            "There should not be normal nodes as source nodes"));
+    PADDLE_ENFORCE_EQ(task_node->downstream().empty(),
+                      false,
+                      platform::errors::PreconditionNotMet(
+                          "There should not be normal nodes as sink nodes"));
   }
 }
 
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h
index fe3d492676655838f6f077718ef65681bcdb53cb..2523942e06223f6210461a625a1a3bce2dcedb92 100644
--- a/paddle/fluid/distributed/fleet_executor/carrier.h
+++ b/paddle/fluid/distributed/fleet_executor/carrier.h
@@ -100,8 +100,6 @@ class Carrier final {
   std::unordered_map<int64_t, std::unique_ptr<Interceptor>>
       interceptor_idx_to_interceptor_;
 
-  std::vector<int64_t> source_interceptor_ids_;
-
   bool is_init_{false};
 
   std::mutex running_mutex_;
diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
index 5b96ee76e7144692bad974c14a2bce1f6ae2f3b4..5017f81523c8aea31fb8732e001e4af311313d32 100644
--- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
@@ -34,29 +34,10 @@ void ComputeInterceptor::PrepareDeps() {
 
   for (auto up : upstream) {
     in_readys_.emplace(up.first, std::make_pair(up.second, 0));
-    in_stops_.emplace(up.first, false);
   }
   for (auto down : downstream) {
     out_buffs_.emplace(down.first, std::make_pair(down.second, 0));
   }
-
-  // source compute node, should we add a new SourceInterceptor?
-  if (upstream.empty()) {
-    is_source_ = true;
-    PADDLE_ENFORCE_GT(node_->max_run_times(),
-                      0,
-                      platform::errors::InvalidArgument(
-                          "Source ComputeInterceptor must run at least one "
-                          "times, but now max_run_times=%ld",
-                          node_->max_run_times()));
-    in_readys_.emplace(-1,
-                       std::make_pair(std::numeric_limits<int64_t>::max(), 0));
-  }
-
-  // If there is no downstream or every downstream is in different rank,
-  // then this interceptor is the last one for current rank.
-  // This can be get during init, can be cached for later use.
-  is_last_ = downstream.empty();
 }
 
 void ComputeInterceptor::IncreaseReady(int64_t up_id) {
@@ -66,12 +47,6 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) {
                     platform::errors::NotFound(
                         "Cannot find upstream=%lld in in_readys.", up_id));
 
-  // source node has no upstream, data_is_ready is send by carrier or others
-  if (is_source_ && up_id == -1) {
-    it->second.second += GetTaskNode()->max_run_times();
-    return;
-  }
-
   auto max_ready_size = it->second.first;
   auto ready_size = it->second.second;
   ready_size += 1;
@@ -152,7 +127,7 @@ void ComputeInterceptor::SendDataReadyToDownStream() {
     ready_msg.set_message_type(DATA_IS_READY);
     VLOG(3) << "ComputeInterceptor " << interceptor_id_
             << " Send data_is_ready msg to " << down_id
-            << " for step: " << step_;
+            << " in scope: " << cur_scope_id_;
     Send(down_id, ready_msg);
   }
 }
@@ -173,8 +148,7 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
 
     VLOG(3) << "ComputeInterceptor " << interceptor_id_
             << " Reply data_is_useless msg to " << up_id
-            << " for step: " << step_;
-    if (is_source_ && up_id == -1) return;
+            << " in scope: " << cur_scope_id_;
 
     InterceptorMessage reply_msg;
     reply_msg.set_message_type(DATA_IS_USELESS);
@@ -183,16 +157,20 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
 }
 
 void ComputeInterceptor::RunOps() {
-  VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
-          << step_ + 1 << " time.";
   for (auto op : node_->ops()) {
-    op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_);
+    PADDLE_ENFORCE_LT(cur_scope_id_,
+                      microbatch_scopes_.size(),
+                      platform::errors::InvalidArgument(
+                          "Step out of range. There are %ld "
+                          "microbatch_scopes, but received scope index %ld",
+                          microbatch_scopes_.size(),
+                          cur_scope_id_));
+    op->Run(*microbatch_scopes_[cur_scope_id_], place_);
     if (gc_) {
-      framework::DeleteUnusedTensors(
-          *microbatch_scopes_[step_ % node_->max_run_times()],
-          op,
-          node_->unused_vars(),
-          gc_.get());
+      framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
+                                     op,
+                                     node_->unused_vars(),
+                                     gc_.get());
     }
   }
 }
@@ -201,77 +179,28 @@ void ComputeInterceptor::Run() {
   while (IsInputReady() && CanWriteOutput()) {
     VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running";
 
+    // get the ready scope id from queue
+    cur_scope_id_ = ready_queue_.front();
+    ready_queue_.pop();
+
     RunOps();
-    ++step_;
 
     // send to downstream and increase buff used
     SendDataReadyToDownStream();
     // reply to upstream and decrease ready data
     ReplyCompletedToUpStream();
-    // Try to stop Carrier
-    if (is_last_ && (step_ % node_->max_run_times() == 0)) {
-      VLOG(3) << "Interceptor " << GetInterceptorId()
-              << " is stopping carrier.";
-      // FIXME(wangxi): with multi sink interceptor
-      StopCarrier();
-    }
-  }
-}
-
-void ComputeInterceptor::ReceivedStop(int64_t up_id) {
-  received_stop_ = true;
-
-  // source node has no upstream, stop is send by carrier or others
-  if (is_source_ && up_id == -1) return;
-
-  auto it = in_stops_.find(up_id);
-  PADDLE_ENFORCE_NE(it,
-                    in_stops_.end(),
-                    platform::errors::NotFound(
-                        "Cannot find upstream=%lld in in_stops.", up_id));
-  PADDLE_ENFORCE_EQ(
-      it->second,
-      false,
-      platform::errors::AlreadyExists("Already received stop from %lld, stop "
-                                      "cannot be send more than once."));
-  it->second = true;
-}
-
-void ComputeInterceptor::TryStop() {
-  if (!received_stop_) return;
-
-  // can stop only when all upstream is stop and
-  // downstream complete
-  for (auto& in_stop : in_stops_) {
-    if (!in_stop.second) return;
-  }
-  for (auto& out_buff : out_buffs_) {
-    auto used_size = out_buff.second.second;
-    if (used_size != 0) return;
   }
-
-  // send stop to downstream
-  for (auto& out : out_buffs_) {
-    auto down_id = out.first;
-    InterceptorMessage stop;
-    stop.set_message_type(STOP);
-    Send(down_id, stop);
-  }
-  stop_ = true;
 }
 
 void ComputeInterceptor::Compute(const InterceptorMessage& msg) {
   if (msg.message_type() == DATA_IS_READY) {
     IncreaseReady(msg.src_id());
+    ready_queue_.push(msg.scope_idx());
     Run();
   } else if (msg.message_type() == DATA_IS_USELESS) {
     DecreaseBuff(msg.src_id());
     Run();
-  } else if (msg.message_type() == STOP) {
-    ReceivedStop(msg.src_id());
   }
-
-  TryStop();
 }
 
 REGISTER_INTERCEPTOR(Compute, ComputeInterceptor);
diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h
index fb82ce76c7bdb851c32b1959121059cfca041b94..9709cd4437f1019fea80cf04ecce5a38f74bb463 100644
--- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <queue>
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
 
@@ -30,7 +31,8 @@ class ComputeInterceptor : public Interceptor {
   virtual void SendDataReadyToDownStream();
   virtual void ReplyCompletedToUpStream();
 
-  int64_t step_{0};
+  std::queue<int64_t> ready_queue_;
+  int64_t cur_scope_id_;
 
  private:
   void PrepareDeps();
@@ -43,19 +45,10 @@ class ComputeInterceptor : public Interceptor {
   void Run();
   void Compute(const InterceptorMessage& msg);
 
-  void ReceivedStop(int64_t up_id);
-  void TryStop();
-
-  bool is_source_{false};
-  bool is_last_{false};
-
   // upstream_id-->(max_ready_size, ready_size)
   std::map<int64_t, std::pair<int64_t, int64_t>> in_readys_{};
   // downstream_id-->(max_buffer_size, used_size)
   std::map<int64_t, std::pair<int64_t, int64_t>> out_buffs_{};
-
-  bool received_stop_{false};
-  std::map<int64_t, bool> in_stops_{};
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h
index 6a761072027a924f21b38f7a694bba65b77e425d..2c20e1ad6113ecda58404429697fa4077fece492 100644
--- a/paddle/fluid/distributed/fleet_executor/interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/interceptor.h
@@ -93,7 +93,6 @@ class Interceptor {
   TaskNode* node_;
 
   // for stop
-  bool stop_{false};
   void StopCarrier();
 
   // for runtime
@@ -114,9 +113,6 @@ class Interceptor {
 
   std::mutex mutex_;
   std::deque<InterceptorMessage> messages_;
-
-  int64_t already_run_times_{0};
-  int64_t used_slot_nums_{0};
 };
 
 class InterceptorFactory {
diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.h b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h
index cb1d698a78526fdde61586304e588e8009340584..1abb7a641e23a5237570b9f469009f4fa3fb72a7 100644
--- a/paddle/fluid/distributed/fleet_executor/sink_interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h
@@ -25,7 +25,7 @@ namespace distributed {
  * 1. record the num of micro-step
  * 2. check whether to notify carrier the current step is finished
  */
-class SinkInterceptor : public Interceptor {
+class SinkInterceptor final : public Interceptor {
  public:
   SinkInterceptor(int64_t interceptor_id, TaskNode* node);
 
diff --git a/paddle/fluid/distributed/fleet_executor/source_interceptor.h b/paddle/fluid/distributed/fleet_executor/source_interceptor.h
index f8b18fb1848645c44c75db90a7d123ba48aeae21..95e8c1b3b03781a653152219a73e6b590cced631 100644
--- a/paddle/fluid/distributed/fleet_executor/source_interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/source_interceptor.h
@@ -25,7 +25,7 @@ namespace distributed {
  * 1. receive `start` message from carrier
  * 2. send num_of_steps `data_is_ready` message to downstream
  */
-class SourceInterceptor : public Interceptor {
+class SourceInterceptor final : public Interceptor {
  public:
   SourceInterceptor(int64_t interceptor_id, TaskNode* node);
 
diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
index 4992a8b34c9da163af6bb64cad0094da9142afb2..e484031161489f4e6cd54403fbd15da0128433e8 100644
--- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
@@ -25,57 +25,42 @@ limitations under the License. */
 namespace paddle {
 namespace distributed {
 
-class StartInterceptor : public Interceptor {
- public:
-  StartInterceptor(int64_t interceptor_id, TaskNode* node)
-      : Interceptor(interceptor_id, node) {
-    RegisterMsgHandle([this](const InterceptorMessage& msg) { NOP(msg); });
-  }
-
-  void NOP(const InterceptorMessage& msg) {
-    if (msg.message_type() == STOP) {
-      stop_ = true;
-      InterceptorMessage stop;
-      stop.set_message_type(STOP);
-      Send(1, stop);  // stop 1, compute
-      return;
-    }
-    std::cout << GetInterceptorId() << " recv msg from " << msg.src_id()
-              << std::endl;
-  }
-};
-
 TEST(ComputeInterceptor, Compute) {
   std::string carrier_id = "0";
   Carrier* carrier =
       GlobalMap<std::string, Carrier>::Create(carrier_id, carrier_id);
-  carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}});
+  carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}, {1, 0}, {SINK_ID, 0}});
 
   MessageBus* msg_bus = GlobalVal<MessageBus>::Create();
   msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "");
 
   // NOTE: don't delete, otherwise interceptor will use undefined node
-  TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);  // role, rank, task_id
+  TaskNode* source =
+      new TaskNode(0, SOURCE_ID, 3);  // rank, task_id, max_run_times
+  TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);
   TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0);
-  TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0);
+  TaskNode* sink = new TaskNode(0, SINK_ID, 3);
 
-  // a->b->c
+  // source->a->b->sink
+  source->AddDownstreamTask(0);
+  node_a->AddUpstreamTask(SOURCE_ID);
   node_a->AddDownstreamTask(1, 3);
   node_b->AddUpstreamTask(0, 3);
-  node_b->AddDownstreamTask(2);
-  node_c->AddUpstreamTask(1);
+  node_b->AddDownstreamTask(SINK_ID);
+  sink->AddUpstreamTask(1);
 
-  Interceptor* a =
-      carrier->SetInterceptor(0, std::make_unique<StartInterceptor>(0, node_a));
+  carrier->SetInterceptor(
+      SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source));
+  carrier->SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a));
   carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b));
-  carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c));
+  carrier->SetInterceptor(SINK_ID,
+                          InterceptorFactory::Create("Sink", SINK_ID, sink));
 
+  // start
   InterceptorMessage msg;
-  msg.set_message_type(DATA_IS_READY);
-  // test run three times
-  a->Send(1, msg);
-  a->Send(1, msg);
-  a->Send(1, msg);
+  msg.set_message_type(START);
+  msg.set_dst_id(SOURCE_ID);
+  carrier->EnqueueInterceptorMessage(msg);
 
   carrier->Wait();
   carrier->Release();
diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
index 54adf06fb67ddf6e5d9ac803b3aa097289c33c38..f43f3860199fb772bc5d4537a41490a70c8270e5 100644
--- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
@@ -33,7 +33,6 @@ class PingPongInterceptor : public Interceptor {
   void PingPong(const InterceptorMessage& msg) {
     if (msg.message_type() == STOP) {
-      stop_ = true;
       return;
     }
     std::cout << GetInterceptorId() << " recv msg, count=" << count_
diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
index 3828c4478cbe6eecad18a88ce5501eae84eb0589..62c23068d7d4a9eb6574aacc53d0c258ae2ddc51 100644
--- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
@@ -36,7 +36,6 @@ class PingPongInterceptor : public Interceptor {
   void PingPong(const InterceptorMessage& msg) {
     if (msg.message_type() == STOP) {
-      stop_ = true;
       StopCarrier();
       return;
     }
diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
index 99f5e789081ba69794f189aeff0cfa8e72f7d34a..d272055d5deda8b01cb58b6a4279c56bc4ce224a 100644
--- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
@@ -22,6 +22,10 @@ if(WITH_ROCM)
   target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB})
 endif()
 
+if(WITH_CINN)
+  target_link_libraries(eager_generator ${PYTHON_LIBRARIES})
+endif()
+
 # Prepare file structure
 message(
   "Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated"
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index aa3695e7fbf4fe3e313458179eee83bbdd40fcdc..f21ca0c858acc0245333985e4e4d8c52421aa57e 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -47,7 +47,7 @@ void SetFeedVariable(Scope* scope,
 }
 
 void SetFeedVariable(Scope* scope,
-                     const Strings& input,
+                     const std::vector<std::string>& input,
                      const std::string& var_name,
                      size_t index) {
   // If var_name Variable is not found in GlobalScope, a new variable will
@@ -59,7 +59,7 @@ void SetFeedVariable(Scope* scope,
     feed_inputs.resize(index + 1);
   }
   // shared data with input tensor
-  feed_inputs[index] = input;
+  feed_inputs[index] = Strings(input);
 }
 
 FetchType& GetFetchVariable(const Scope& scope,
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index 356e6d3ddf8cfd6802bc1fe192b6c134b8a94f73..b3d5c91994db5caf9ec1773fdf242f4bd7be6d8b 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -35,7 +35,7 @@ void SetFeedVariable(Scope* scope,
                      size_t index);
 
 void SetFeedVariable(Scope* scope,
-                     const Strings& input,
+                     const std::vector<std::string>& input,
                      const std::string& var_name,
                      size_t index);
 
diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h
index 981a303cd191590546eb75618cc36b4abee63e73..571667bff47eb454c0366398faa6d04d84448219 100644
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
@@ -19,12 +19,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/string_array.h"
+#include "paddle/phi/core/extended_tensor.h"
 
 namespace paddle {
 namespace framework {
 using FeedType = paddle::variant
-using FeedList = std::vector<FeedType>;
+
+using FeedList = paddle::framework::PhiVector<FeedType>;
 
 using FetchType = paddle::variant
diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc
+template <typename T, typename Context>
+void InferShapeUtilsTestOutputKernel(const Context& dev_ctx,
+                                     const phi::DenseTensor& x,
+                                     phi::SparseCooTensor* out) {
+  VLOG(6) << "Come into InferShapeUtilsTestOutputKernel";
+}
+
 }  // namespace framework
 }  // namespace paddle
 
@@ -143,6 +182,21 @@ PD_REGISTER_KERNEL(infer_shape_utils_test,
                    paddle::framework::InferShapeUtilsTestKernel,
                    int) {}
 
+DECLARE_INFER_SHAPE_FUNCTOR(
+    infer_shape_utils_test_output,
+    InferShapeUtilsTestOutputInferShapeFunctor,
+    PD_INFER_META(paddle::framework::TestOutputInferMeta));
+REGISTER_OPERATOR(infer_shape_utils_test_output,
+                  paddle::framework::InferShapeUtilsTestOutputOp,
+                  paddle::framework::InferShapeUtilsTestOutputOpMaker,
+                  InferShapeUtilsTestOutputInferShapeFunctor);
+
+PD_REGISTER_KERNEL(test_sparse_coo_tensor_output,
+                   CPU,
+                   ALL_LAYOUT,
+                   paddle::framework::InferShapeUtilsTestOutputKernel,
+                   int) {}
+
 TEST(InferShapeUtilsTest, ALL) {
   paddle::framework::ProgramDesc prog;
   paddle::framework::proto::BlockDesc proto_block;
   paddle::framework::BlockDesc block_desc(&prog, &proto_block);
@@ -200,3 +254,27 @@
 
   op->InferShape(block_desc);
 }
+
+TEST(InferShapeUtilsTestOutput, ALL) {
+  paddle::framework::ProgramDesc prog;
+  paddle::framework::proto::BlockDesc proto_block;
+  paddle::framework::BlockDesc block_desc(&prog, &proto_block);
+
+  auto* op = block_desc.AppendOp();
+  op->SetType("infer_shape_utils_test_output");
+
+  auto* x = block_desc.Var("x");
+  x->SetType(paddle::framework::proto::VarType::LOD_TENSOR);
+  x->SetDataType(paddle::framework::proto::VarType::FP32);
+  op->SetInput("X", {"x"});
+
+  auto* out = block_desc.Var("out");
+  out->SetType(paddle::framework::proto::VarType::SPARSE_COO);
+  op->SetOutput("Out", {"out"});
+
+  phi::OpUtilsMap::Instance().InsertArgumentMappingFn(
+      "infer_shape_utils_test_output",
+      paddle::framework::TestSparseOutputOpArgumentMapping);
+
+  op->InferShape(block_desc);
+}
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 6eda1f4b23f8e57cbadfa30cd99d0a2514a87be9..b387dc1d6cc26b5cd0cf6ea45014f8986c4035ab 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -460,14 +460,6 @@ if(WITH_MKLDNN)
     test_cpu_quantize_squash_pass
     SRCS mkldnn/cpu_quantize_squash_pass_tester.cc
     DEPS cpu_quantize_squash_pass naive_executor)
-  cc_test(
-    test_reshape_transpose_matmul_mkldnn_fuse_pass
-    SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
-    DEPS reshape_transpose_matmul_mkldnn_fuse_pass)
-  cc_test(
-    test_matmul_transpose_reshape_fuse_pass
-    SRCS mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
-    DEPS matmul_transpose_reshape_mkldnn_fuse_pass)
   cc_test(
     test_shuffle_channel_mkldnn_detect_pass
     SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
deleted file mode 100644
index 4149bb2347317a00cd3094bd535ddb9ae704463d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc *prog,
-           const std::string &type,
-           const std::vector<std::string> &inputs,
-           const std::vector<std::string> &outputs) {
-  auto *op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetInput("X", {inputs[0]});
-  op->SetOutput("Out", {outputs[0]});
-  if (type == "transpose2") {
-    op->SetAttr("axis", std::vector<int>({0, 2, 1, 3}));
-    op->SetOutput("XShape", {outputs[1]});
-  }
-  if (type == "reshape2") {
-    op->SetAttr("shape", std::vector<int>({4, 5, 6}));
-    op->SetOutput("XShape", {outputs[1]});
-  }
-
-  if (type == "matmul") {
-    op->SetInput("Y", {inputs[1]});
-    op->SetAttr("use_mkldnn", true);
-    op->SetAttr("alpha", 1.0f);
-    op->SetAttr("transpose_X", true);
-    op->SetAttr("transpose_Y", true);
-  }
-  if (type == "matmul_v2") {
-    op->SetInput("Y", {inputs[1]});
-    op->SetAttr("use_mkldnn", true);
-    op->SetAttr("trans_x", true);
-    op->SetAttr("trans_y", true);
-  }
-}
-
-ProgramDesc BuildProgramDesc(const std::string &op_name) {
-  ProgramDesc prog;
-  for (auto &v : std::initializer_list<std::string>(
-           {"a1", "a2", "b", "c", "cx", "d", "dx", "e"})) {
-    auto *var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::SELECTED_ROWS);
-  }
-
-  SetOp(&prog, op_name, {"a1", "a2"}, {"b"});
-  SetOp(&prog, "transpose2", {"b"}, {"c", "cx"});
-  SetOp(&prog, "reshape2", {"c"}, {"d", "dx"});
-  SetOp(&prog, "fc", {"d"}, {"e"});
-
-  return prog;
-}
-
-void MainTest(const ProgramDesc &prog, const std::string &op_name) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  int original_nodes_num = graph->Nodes().size();
-
-  auto pass =
-      PassRegistry::Instance().Get("matmul_transpose_reshape_mkldnn_fuse_pass");
-  graph.reset(pass->Apply(graph.release()));
-
-  int current_nodes_num = graph->Nodes().size();
-  EXPECT_EQ(original_nodes_num - 6, current_nodes_num);
-
-  for (auto *node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto *op = node->Op();
-      if (op->Type() == op_name) {
-        EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_reshape_Out"),
-                  std::vector<int>({4, 5, 6}));
-        EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_transpose_Out"),
-                  std::vector<int>({0, 2, 1, 3}));
-      }
-    }
-  }
-}
-
-TEST(MatmulTransposeReshapeFusePass, matmul_fuse_pass) {
-  auto prog = BuildProgramDesc("matmul");
-  MainTest(prog, "matmul");
-}
-
-TEST(MatmulTransposeReshapeFusePass, matmul_v2_fuse_pass) {
-  auto prog = BuildProgramDesc("matmul_v2");
-  MainTest(prog, "matmul_v2");
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(matmul_transpose_reshape_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
deleted file mode 100644
index 2dd13573d98a054167db0a7686d106fb151af605..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h"
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void AddVarToScope(Scope* param_scope,
-                   const std::string& name,
-                   const DDim& dims) {
-  auto* tensor = param_scope->Var(name)->GetMutable<LoDTensor>();
-  tensor->Resize(dims);
-  tensor->mutable_data<float>(phi::CPUPlace());
-}
-
-Scope* CreateParamScope() {
-  auto param_scope = new Scope();
-  AddVarToScope(param_scope, "w1", {768, 768});
-  AddVarToScope(param_scope, "bias1", {768});
-  AddVarToScope(param_scope, "w2", {768, 768});
-  AddVarToScope(param_scope, "bias2", {768});
-  return param_scope;
-}
-
-void TestMain(const std::string& op_name, bool with_xshapes) {
-  // inputs        operator     output
-  // -----------------------------------------------
-  // a1,w1,bias1   fc        -> b1
-  // b1            reshape   -> c1
-  // c1            transpose -> d1
-  // a2,w2,bias2   fc        -> b2
-  // b2            reshape   -> c2
-  // c2            transpose -> d2
-  // (d1, d2)      matmul(_v2) -> (...)
-  Layers layers;
-  auto* a1 = layers.data("a1", {-1, 128, 768});
-  auto* w1 = layers.data("w1", {768, 768}, true);
-  auto* bias1 = layers.data("bias1", {768}, true);
-  auto* b1 = layers.fc(a1, w1, bias1, 2);
-  b1->SetShape({-1, 128, 768});
-  auto* c1 = layers.reshape2(b1, {0, 0, 12, 64}, with_xshapes);
-  c1->SetShape({-1, 128, 12, 64});
-  auto* d1 = layers.transpose2(c1, {0, 2, 1, 3}, with_xshapes);
-  d1->SetShape({-1, 12, 128, 64});
-  auto* a2 = layers.data("a2", {-1, 128, 768});
-  auto* w2 = layers.data("w2", {768, 768}, true);
-  auto* bias2 = layers.data("bias2", {768}, true);
-  auto* b2 = layers.fc(a2, w2, bias2, 2);
-  b2->SetShape({-1, 128, 768});
-  auto* c2 = layers.reshape2(b2, {0, 0, 12, 64});
-  c2->SetShape({-1, 128, 12, 64});
-  auto* d2 = layers.transpose2(c2, {0, 2, 1, 3});
-  d2->SetShape({-1, 12, 128, 64});
-  if (op_name == "matmul_v2") {
-    layers.matmul_v2(d1, d2);
-  } else {
-    layers.matmul(d1, d2);
-  }
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
-  graph->Set("__param_scope__", CreateParamScope());
-
-  int num_reshape_nodes_before = GetNumOpNodes(graph, "reshape2");
-  int num_transpose_nodes_before = GetNumOpNodes(graph, "transpose2");
-  int total_nodes_before = graph->Nodes().size();
-  VLOG(3) << DebugString(graph);
-
-  auto pass =
-      PassRegistry::Instance().Get("reshape_transpose_matmul_mkldnn_fuse_pass");
-  graph.reset(pass->Apply(graph.release()));
-
-  int num_reshape_nodes_after = GetNumOpNodes(graph, "reshape2");
-  int num_transpose_nodes_after = GetNumOpNodes(graph, "transpose2");
-  int total_nodes_after = graph->Nodes().size();
-  VLOG(3) << DebugString(graph);
-
-  EXPECT_EQ(num_reshape_nodes_before, 2);
-  EXPECT_EQ(num_reshape_nodes_after, 0);
-  EXPECT_EQ(num_transpose_nodes_before, 2);
-  EXPECT_EQ(num_transpose_nodes_after, 0);
-  int removed = 8;  // 2* reshape, reshape_out, transpose, transpose_out
-  if (with_xshapes) removed += 2;  // transpose_xshape, reshape_xshape
-  EXPECT_EQ(total_nodes_before - removed, total_nodes_after);
-  auto* matmul_op_desc = GetOpNodes(graph, op_name).at(0)->Op();
-
-  auto check = [&matmul_op_desc](std::string a) {
-    std::string shape_str = "fused_reshape_" + a;
-    auto shape = matmul_op_desc->GetAttrIfExists<std::vector<int>>(shape_str);
-    EXPECT_EQ(shape, (std::vector<int>{0, 0, 12, 64}));
-    std::string axis_str = "fused_transpose_" + a;
-    auto axis = matmul_op_desc->GetAttrIfExists<std::vector<int>>(axis_str);
-    EXPECT_EQ(axis, (std::vector<int>{0, 2, 1, 3}));
-  };
-  check("X");
-  check("Y");
-}
-
-TEST(ReshapeTransposeMatmulMkldnnFusePass,
-     both_matmul_inputs_reshape_transpose) {
-  TestMain("matmul", false);
-}
-
-TEST(ReshapeTransposeMatmulMkldnnFusePass,
-     both_matmul_inputs_reshape_transpose_one_with_xshapes) {
-  TestMain("matmul", true);
-}
-
-TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
-     both_matmulv2_inputs_reshape_transpose) {
-  TestMain("matmul_v2", false);
-}
-
-TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
-     both_matmulv2_inputs_reshape_transpose_one_with_xshapes) {
-  TestMain("matmul_v2", true);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(reshape_transpose_matmul_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
index e7e925a47797faf6aa2457ca78e62b8a6ee1bef2..73e6664f66f1e04a810e4ed58d13f5b7c05e528e 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
+++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
@@ -19,14 +19,27 @@
 #include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h"
 
 DECLARE_bool(fast_eager_deletion_mode);
+DECLARE_bool(new_executor_use_cuda_graph);
 
 namespace paddle {
 namespace framework {
 
 bool IsInterpretercoreFastGCEnabled() {
-  return memory::allocation::AllocatorFacade::Instance()
-             .IsStreamSafeCUDAAllocatorUsed() &&
-         FLAGS_fast_eager_deletion_mode;
+  // When using cuda graph, fast GC must be used. Because
+  // `EventQuery` method in event GC cannot be used in
+  // cuda graph.
+  PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
+                            .IsStreamSafeCUDAAllocatorUsed() == false &&
+                        FLAGS_new_executor_use_cuda_graph,
+                    false,
+                    platform::errors::InvalidArgument(
+                        "When FLAGS_new_executor_use_cuda_graph is true, "
+                        "IsStreamSafeCUDAAllocatorUsed must be true, but "
+                        "got false."));
+  return (memory::allocation::AllocatorFacade::Instance()
+              .IsStreamSafeCUDAAllocatorUsed() &&
+          FLAGS_fast_eager_deletion_mode) ||
+         FLAGS_new_executor_use_cuda_graph;
 }
 
 InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() {
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 4f2a4f48b7f99749963eaa39160360b54a5620d0..63525330ea60debc6db363d96f6049153cd4550a 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -31,6 +31,7 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/backends/device_manager.h"
 
 PADDLE_DEFINE_EXPORTED_bool(
@@ -50,6 +51,10 @@ PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor,
 
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(benchmark);
+DECLARE_bool(new_executor_use_cuda_graph);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+DECLARE_bool(sync_nccl_allreduce);
+#endif
 
 constexpr const char* kExceptionCaught = "ExceptionCaught";
 constexpr const char* kTaskCompletion = "TaskCompletion";
@@ -142,6 +147,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
     }
     return lhs_prority > rhs_prority;
   };
+
+  PrepareForCUDAGraphCapture();
 }
 
 InterpreterCore::~InterpreterCore() {
@@ -161,6 +168,7 @@ interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
     const std::vector<phi::DenseTensor>& feed_tensors) {
   SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
 
   Prepare(feed_names, feed_tensors, true);
   interpreter::CostInfo cost_info;
@@ -221,6 +229,7 @@ paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<phi::DenseTensor>& feed_tensors) {
   SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
 
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
@@ -240,7 +249,16 @@ paddle::framework::FetchList InterpreterCore::Run(
   // return Fetch Tensors
   auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
   if (fetch_var) {
-    return std::move(*fetch_var->GetMutable<framework::FetchList>());
+    auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
+#ifdef PADDLE_WITH_CUDA
+    if (platform::IsCUDAGraphCapturing()) {
+      PADDLE_ENFORCE_EQ(fetch_list.empty(),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "Cannot fetch data when using CUDA Graph."));
+    }
+#endif
+    return fetch_list;
   } else {
     return {};
   }
@@ -249,6 +267,7 @@ paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names, bool need_fetch) {
   SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
 
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
@@ -290,7 +309,16 @@ paddle::framework::FetchList InterpreterCore::Run(
       HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
   auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
   if (fetch_var && need_fetch) {
-    return std::move(*fetch_var->GetMutable<framework::FetchList>());
+    auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
+#ifdef PADDLE_WITH_CUDA
+    if (platform::IsCUDAGraphCapturing()) {
+      PADDLE_ENFORCE_EQ(fetch_list.empty(),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "Cannot fetch data when using CUDA Graph."));
+    }
+#endif
+    return fetch_list;
   } else {
     return {};
   }
@@ -504,6 +532,67 @@ void InterpreterCore::BuildInplace() {
   }
 }
 
+void InterpreterCore::PrepareForCUDAGraphCapture() {
+  if (!FLAGS_new_executor_use_cuda_graph) return;
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_EQ(
+      platform::IsCUDAGraphCapturing(),
+      false,
+      platform::errors::PermissionDenied("CUDA Graph is not allowed to capture "
+                                         "when running the first batch."));
+  PADDLE_ENFORCE_EQ(platform::is_gpu_place(place_),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "CUDA Graph is only supported on NVIDIA GPU device."));
+  // If set true, will call `cudaStreamSynchronize(nccl_stream)` after
+  // allreduce, which may cause errors in cuda graph. This behavior is
+  // consistent with PE.
+  PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce,
+                    false,
+                    platform::errors::InvalidArgument(
+                        "FLAGS_sync_nccl_allreduce must be False to support "
+                        "CUDA Graph capturing."));
+
+  // All output vars of coalesce_tensor op should not be gc'ed.
+  // If a fused output var of coalesce_tensor is gc'ed, it will cause an
+  // accuracy problem. The specific reasons need to be analyzed.
+  for (auto& op_desc : block_.AllOps()) {
+    if (op_desc->Type() == kCoalesceTensor) {
+      for (auto& out_var_name : op_desc->OutputArgumentNames()) {
+        execution_config_.skip_gc_vars.insert(out_var_name);
+        VLOG(4) << "Insert Var(" << out_var_name << ") into skip_gc_vars.";
+      }
+    }
+  }
+#else
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "CUDA Graph is only supported on NVIDIA GPU device."));
+#endif
+}
+
+void InterpreterCore::CheckCUDAGraphBeforeRun(
+    const std::vector<std::string>& feed_names) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::IsCUDAGraphCapturing()) {
+    PADDLE_ENFORCE_EQ(
+        feed_names.empty(),
+        true,
+        platform::errors::InvalidArgument(
+            "Feeding data is not permitted when capturing CUDA Graph."));
+    PADDLE_ENFORCE_EQ(
+        FLAGS_new_executor_use_cuda_graph,
+        true,
+        platform::errors::InvalidArgument(
+            "You must turn on FLAGS_new_executor_use_cuda_graph to True "
+            "to enable CUDA Graph capturing."));
+    PADDLE_ENFORCE_EQ(
+        place_,
+        platform::CUDAGraphCapturingPlace(),
+        platform::errors::InvalidArgument("The place to capture CUDAGraph is "
+                                          "not the same as the place to run."));
+  }
+#endif
+}
+
 void InterpreterCore::BuildOperatorDependences() {
   // analysis the dependences between ops, add next_instr_list to each instr,
   // and set the dependecy_count_
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 74ff5c563652ea486d552c4d1ecf2cbb363fa04d..53625c87938305c6a22909d70352d0cb1095b1d0 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -97,6 +97,10 @@ class InterpreterCore {
       const std::vector<std::vector<size_t>>& input_var2op, size_t var_index);
   void SetFeedVarsInplaceSkip(const std::vector<std::string>& feed_names);
 
+  // cuda graph
+  void CheckCUDAGraphBeforeRun(const std::vector<std::string>& feed_names);
+  void PrepareForCUDAGraphCapture();
+
   // execution
   void RunImpl();
   void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 5e8d0b1b87ae27a0dd40ac3e29a31cf149e14576..fe863381b570bba755d04e5c74e8bdd1b4630eef 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1561,6 +1561,63 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->Info().infer_shape_(&infer_shape_ctx);
 }
 
+template <typename T>
+bool HasSameTensorType(phi::TensorBase* phi_tensor, Variable* var) {
+  if (phi_tensor == nullptr && var == nullptr) {
+    return true;
+  } else if (phi_tensor != nullptr && var != nullptr) {
+    if (T::classof(phi_tensor) && var->IsType<T>()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// TODO(YuanRisheng): We need to collect all `need_prepare_phi_data_`
+// logic into this function.
+void OperatorWithKernel::CheckWhetherPreparePhiData(
+    const VariableNameMap& innames,
+    const VariableNameMap& outnames,
+    const Scope& scope) const {
+  if (run_phi_kernel_ && impl_ != nullptr) {
+    const auto& phi_kernel_context = impl_->getKernelContext();
+    size_t phi_tensor_index = 0;
+    // Check each tensor in the KernelContext. If there is a tensor that has a
+    // different type from its variable, the PhiKernelContext needs to be
+    // reconstructed. We use kernel_signature_'s output to retrieve tensors,
+    // because the tensors in phi_kernel_context are stored in the order of
+    // kernel_signature_'s outputs.
+    if (phi_kernel_context->OutputsSize() >= phi_tensor_index ||
+        kernel_signature_ == nullptr) {
+      need_prepare_phi_data_ = true;
+      return;
+    }
+
+    const auto& phi_output_names = kernel_signature_->output_names;
+    for (auto& phi_output_name : phi_output_names) {
+      const auto& iter = outnames.find(phi_output_name);
+      if (iter != outnames.end()) {
+        for (auto& var_name : iter->second) {
+          auto var_output = scope.FindVar(var_name);
+          auto phi_output =
+              phi_kernel_context->MutableOutputAt<phi::TensorBase>(
+                  phi_tensor_index);
+          if (phi_output == nullptr) {
+            continue;
+          }
+          if (!(HasSameTensorType<phi::DenseTensor>(phi_output, var_output) ||
+                HasSameTensorType<phi::SparseCooTensor>(phi_output,
+                                                        var_output) ||
+                HasSameTensorType<framework::Strings>(phi_output,
+                                                      var_output))) {
+            need_prepare_phi_data_ = true;
+          }
+          phi_tensor_index++;
+        }
+      }
+    }
+  }
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1571,6 +1628,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       HasAttr(kAllKernelsMustComputeRuntimeShape))
     all_kernels_must_compute_runtime_shape_ = true;
   const Scope* cur_scope = &scope;
+  CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
   if (!enable_cache_runtime_context_) {
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
@@ -2993,7 +3051,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
           "to the size of kernel attribute_defs (%d).",
           attr_names.size(),
          attr_defs.size()));
-
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto it = ctx.inputs.find(input_names[i]);
 
@@ -3037,6 +3094,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
       } else if (var->IsType<framework::LoDTensorArray>()) {
        tensor_in = &(var->Get<framework::LoDTensorArray>());
         phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
+      } else if (var->IsType<framework::FeedList>()) {
+        tensor_in = &(var->Get<framework::FeedList>());
+        phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
            "Unsupported input `%s` type when call pt kernel.",
@@ -3047,7 +3107,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
     phi_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
   VLOG(4) << "Done inputs";
-
   for (size_t i = 0; i < output_names.size(); ++i) {
     auto it = ctx.outputs.find(output_names[i]);
     size_t start_idx =
@@ -3087,6 +3146,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
         // Note: If the input LoDTensorArray size is 0, the output
         // LoDTensorArray is also 0
         phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
+      } else if (var->template IsType<phi::SparseCooTensor>()) {
+        tensor_out = var->template GetMutable<phi::SparseCooTensor>();
+        phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
       } else if (var->template IsType<framework::Strings>()) {
         tensor_out = var->template GetMutable<framework::Strings>();
         phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
@@ -3108,7 +3170,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
           i);
   }
   VLOG(4) << "Done outputs";
-
   for (size_t i = 0; i < attr_names.size(); ++i) {
     VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": "
             << attr_defs[i].type_index;
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index b4e0c94c20be2d189a03fe73d59bd46447acb1c8..955f30f3406190a5bd3bd2a77161f3b0a5c3336c 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -550,6 +550,13 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
     return var->IsType<phi::SparseCooTensor>();
   }
 
+  bool IsSparseCooTensorOutput(const std::string& name) const override {
+    auto vars = ctx_.MultiOutputVar(name);
+    return std::all_of(vars.begin(), vars.end(), [](const Variable* var) {
+      return var->IsType<phi::SparseCooTensor>();
+    });
+  }
+
   bool IsSparseCsrTensorInput(const std::string& name) const override {
     const auto* var = ctx_.InputVar(name);
     return var->IsType<phi::SparseCsrTensor>();
@@ -746,6 +753,10 @@ class OperatorWithKernel : public OperatorBase {
                              RuntimeContext* ctx,
                              const phi::Place& place) const;
 
+  void CheckWhetherPreparePhiData(const VariableNameMap& innames,
+                                  const VariableNameMap& outnames,
+                                  const Scope& scope) const;
+
   void TransferInplaceVarsBack(const Scope& scope,
                                const std::vector<std::string>& inplace_vars,
                                const Scope& exec_scope) const;
diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
index 1d8a9819a815d90548ec73db05d51101ab196a07..6bb21c569b30289e9c9eeed0a69b27b222965037 100644
--- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
+++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
@@ -60,6 +60,7 @@ if(WITH_TESTING)
       elementwise_add_op
       generated_op)
     set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN")
+    target_link_libraries(build_cinn_pass_test ${PYTHON_LIBRARIES})
 
     cc_test_old(transform_desc_test SRCS transform_desc_test.cc DEPS
                 transform_desc)
diff --git a/paddle/fluid/framework/raw_tensor.h b/paddle/fluid/framework/raw_tensor.h
index dfee5acd14af0ca7829b501cb7230db8011f2df1..60ccd6a5bae3e468e3272044b35b5ed88174c187 100644
--- a/paddle/fluid/framework/raw_tensor.h
+++ b/paddle/fluid/framework/raw_tensor.h
@@ -23,8 +23,8 @@ namespace paddle {
 namespace framework {
 
 /// \brief Fluid Kernel and PHI Kernel will be unified in the future.
-/// So, we need a class in PHI that can represent the RAW type in Fluid.
-/// The RawTensor is for PHI Kernel that has RAW type arguments.
+/// So, we need a class in PHI that can represent the RawTensor type in Fluid.
+/// The RawTensor is for PHI Kernel that has RawTensor type arguments.
 class RawTensor : public phi::ExtendedTensor,
                   public phi::TypeInfoTraits<phi::TensorBase, RawTensor> {
  public:
@@ -37,13 +37,35 @@ class RawTensor : public phi::ExtendedTensor,
   RawTensor& operator=(RawTensor&& other) = default;
 
   /// \brief Destroy the RawTensor and release exclusive resources.
-  virtual ~RawTensor() = default;
+  virtual ~RawTensor() {
+    if (!data_.empty()) {
+      data_deleter_();
+    }
+  }
 
  public:
   /// \brief Returns the name of the class for type traits.
   /// \return The name of the class.
   static const char* name() { return "RawTensor"; }
 
+  template <typename T>
+  T& Get() const {
+    PADDLE_ENFORCE_EQ(data_.empty(),
+                      false,
+                      platform::errors::PreconditionNotMet(
+                          "The data in RawTensor is empty. Please set data "
+                          "before using it."));
+
+    try {
+      return *(paddle::any_cast<T*>(data_));
+    } catch (paddle::bad_any_cast&) {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Invalid data type error, expected %s, actual %s.",
+          typeid(T).name(),
+          data_type_.name()));
+    }
+  }
+
   template <typename T>
   T* GetMutable() {
     if (!data_.empty()) {
@@ -70,7 +92,7 @@ class RawTensor : public phi::ExtendedTensor,
 
  private:
   paddle::any data_;
-  std::function<void(void)> data_deleter_;
+  std::function<void(void)> data_deleter_ = []() {};
   std::type_index data_type_ = std::type_index(typeid(void));
 };
 
diff --git a/paddle/fluid/framework/string_array.h b/paddle/fluid/framework/string_array.h
index 4ac8d89981bee4d143e4452e255f22b7cc207716..9fd245ff91765893971558795dcd67d6e63f1533 100644
--- a/paddle/fluid/framework/string_array.h
+++ b/paddle/fluid/framework/string_array.h
@@ -25,6 +25,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// Note(YuanRisheng): Vocab is mainly used for faster_tokenizer_op and we don't
+// recommend using it widely, because faster_tokenizer_op may be deleted in the
+// future and this class will be deleted with it.
+
 class Vocab : public phi::ExtendedTensor,
               public phi::TypeInfoTraits<phi::TensorBase, Vocab> {
  public:
@@ -94,8 +98,73 @@ class Vocab : public phi::ExtendedTensor,
   std::unordered_map<std::wstring, std::int32_t> data_;
 };
 
+// Note(YuanRisheng): PhiVector is essentially a vector that is only used for
+// PHI Kernel. It can be used when you define a non-tensor type that needs to
+// be stored in a vector as a PHI kernel argument.
+
+template <typename T>
+class PhiVector : public phi::ExtendedTensor,
+                  public phi::TypeInfoTraits<phi::TensorBase, PhiVector<T>> {
+ public:
+  PhiVector() = default;
+
+  explicit PhiVector(const std::vector<T>& init_data) : data_(init_data) {}
+
+  PhiVector(PhiVector&& other) = default;
+
+  PhiVector(const PhiVector& other) = default;
+
+  PhiVector& operator=(const PhiVector& other) = default;
+
+  PhiVector& operator=(const std::vector<T>& other) {
+    data_ = other;
+    return *this;
+  }
+
+  PhiVector& operator=(PhiVector&& other) = default;
+
+  /// \brief Destroy the PhiVector and release exclusive resources.
+  virtual ~PhiVector() = default;
+
+ public:
+  /// \brief Returns the name of the class for type traits.
+  /// \return The name of the class.
+ static const char* name() { + // Cache the composed name so the returned pointer stays valid after return. + static const std::string kName = + std::string("PhiVector_") + std::string(typeid(T).name()); + return kName.c_str(); + } + + size_t size() const { return data_.size(); } + + void resize(size_t size) { data_.resize(size); } + + void clear() { data_.clear(); } + + void emplace_back(const T& feed_data) { data_.emplace_back(feed_data); } + + const T& operator[](size_t index) const { return data_[index]; } + + T& operator[](size_t index) { return data_[index]; } + + T& at(size_t index) { return data_.at(index); } + + const T& at(size_t index) const { return data_.at(index); } + + typename std::vector<T>::iterator begin() { return data_.begin(); } + + typename std::vector<T>::const_iterator begin() const { + return data_.begin(); + } + + typename std::vector<T>::iterator end() { return data_.end(); } + + typename std::vector<T>::const_iterator end() const { return data_.end(); } + + private: + std::vector<T> data_; +}; + using String = std::string; -using Strings = std::vector<std::string>; +using Strings = PhiVector<std::string>; // Convert the std::string type to the std::wstring type. bool ConvertStrToWstr(const std::string& src, std::wstring* res); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 1e6c110e86a30fd84f6d277cfb021213b9dfedcb..fab9d28abbac4eac750901ea7b2a504f9f33e8a3 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -221,6 +221,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< Vocab, std::vector, std::vector, + std::vector, RawTensor>; template <typename T> struct VarTypeTrait { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3a54c7b4ed2e0de110dcf2a359438fa258df48f4..f4c35b49a0d00760f78f3d22a1dab1ee0ffae010 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1655,7 +1655,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor( auto custom_place = place_; auto paddleplace = static_cast<PaddlePlace>( static_cast<size_t>(PaddlePlace::kCUSTOM) + - phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; @@ -1710,7 +1711,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor( auto custom_place = place_; auto paddleplace = static_cast<PaddlePlace>( static_cast<size_t>(PaddlePlace::kCUSTOM) + - phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 02d5f91d630ce8a2108e76ecfbceb7695bb18bd2..5d2357d362e990bcf1f8ae54e71148884cc4b19f 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -25,13 +25,16 @@ if(WITH_ONNXRUNTIME) cc_library( zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc - DEPS onnxruntime) + DEPS onnxruntime phi_enforce) else() cc_library( zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) - cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) + cc_library( + zero_copy_tensor_dummy + SRCS zero_copy_tensor_dummy.cc + DEPS phi_enforce) endif() cc_test( diff --git 
a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index e7cda8707c872471c7e54816652c24d765077302..59b44769ddd38b073947a4096b498a20f17e25d1 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_tensor.h" #include "paddle/fluid/memory/memcpy.h" @@ -76,7 +77,8 @@ void Tensor::ReshapeStrings(const size_t &shape) { var, paddle::platform::errors::PreconditionNotMet( "No tensor called [%s] in the runtime scope", name_)); - paddle_infer::Strings *tensor = var->GetMutable(); + paddle::framework::Strings *tensor = + var->GetMutable(); tensor->resize(shape); } @@ -261,7 +263,9 @@ void Tensor::CopyFromCpu(const T *data) { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); paddle::platform::CustomPlace custom_place( - phi::GetGlobalDeviceType(device_type_id), device_); + phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id), + device_); auto *t_data = tensor->mutable_data(custom_place); auto *dev_ctx = static_cast( pool.Get(custom_place)); @@ -354,7 +358,7 @@ void Tensor::ShareExternalData(const T *data, } void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { - EAGER_GET_TENSOR(paddle_infer::Strings); + EAGER_GET_TENSOR(paddle::framework::Strings); PADDLE_ENFORCE_GE(tensor->size(), 0, paddle::platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc index f7667c6df9eda16370bc707b73bc301596dd98d0..805813cbe153c93e760494f763eadb7847c56bb6 100644 --- a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc +++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc @@ -112,6 +112,12 @@ bool PluginArgumentMappingContext::IsSparseCooTensorInput( const std::string& name) const { return false; } + +bool PluginArgumentMappingContext::IsSparseCooTensorOutput( + const std::string& name) const { + return false; +} + bool PluginArgumentMappingContext::IsSparseCsrTensorInput( const std::string& name) const { return false; diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h index 088e966a0cca742a9ce583aeb84d2c3ab21244a6..c0c30f3ac57b098da9ad9cabfc5a90ed55974d5e 100644 --- a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h +++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h @@ -56,6 +56,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext { bool IsDenseTensorOutput(const std::string& name) const override; + bool IsSparseCooTensorOutput(const std::string& name) const override; + bool IsSelectedRowsOutput(const std::string& name) const override; bool IsForInferShape() const override { return false; } diff --git a/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc index 75716a91f574f7c11cd331ab05fd62865894a7c8..d514fa0bb1af82abbf178586ccf77f47d44c6285 100644 --- a/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc +++ 
b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc @@ -124,6 +124,7 @@ TEST(ArgMappingContexTest, BasicFunction) { EXPECT_EQ(context.IsDenseTensorOutput("Out"), false); EXPECT_EQ(context.IsSelectedRowsOutput("Out"), false); + EXPECT_EQ(context.IsSparseCooTensorOutput("Out"), false); EXPECT_EQ(context.IsForInferShape(), false); } diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index 557a9cf333a3b95caf60065c5a337e989493f61e..55529b58aeb5f1cb4efe6aedd9c582b5f174e4c3 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -26,6 +26,8 @@ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" +DECLARE_bool(use_shm_cache); + namespace paddle { namespace memory { namespace allocation { @@ -111,20 +113,33 @@ void AllocateMemoryMap( std::shared_ptr AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, - size_t size) { + size_t size, + int buffer_id) { int fd = -1; void *base_ptr = nullptr; - AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + if (buffer_id == -1) { + AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + VLOG(4) << "Create and mmap a new shm: " << filename; + } else { + base_ptr = MemoryMapAllocationPool::Instance().GetById(buffer_id).mmap_ptr_; + VLOG(4) << "Get a cached shm " << filename; + } void *aliged_base_ptr = static_cast(static_cast(base_ptr) + mmap_alignment); return std::make_shared( - aliged_base_ptr, size, filename, flags, fd); + aliged_base_ptr, size, filename, flags, fd, buffer_id); } RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( - void *ptr, size_t size, std::string ipc_name, int fd, int flags) + void *ptr, + size_t size, + std::string ipc_name, + int fd, + int flags, + int buffer_id) : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) { // must reset base ptr first. 
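// Editorial note (hypothetical caller sketch, assuming FLAGS_use_shm_cache is
// enabled): callers probe the pool first and hand the result to the factory:
//   int buffer_id = MemoryMapAllocationPool::Instance().FindFromCache(flags, size);
//   auto alloc = AllocateRefcountedMemoryMapAllocation(filename, flags, size, buffer_id);
// A buffer_id of -1 mmaps a fresh shm segment; any other id reuses the pooled mapping.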
+ buffer_id_ = buffer_id; resetBaseptr(); initializeRefercount(); } @@ -165,25 +180,40 @@ void RefcountedMemoryMapAllocation::initializeRefercount() { } void RefcountedMemoryMapAllocation::close() { + VLOG(4) << "Close a RefcountedMemoryMapAllocation: " << ipc_name_; if (closed_) { return; } closed_ = true; void *data = map_ptr_; CountInfo *info = reinterpret_cast<CountInfo *>(data); - if (--info->refcount == 0) { - shm_unlink(ipc_name_.c_str()); - VLOG(6) << "shm_unlink file: " << ipc_name_; + --info->refcount; + if (FLAGS_use_shm_cache && buffer_id_ != -1) { + return; + } else { + if (FLAGS_use_shm_cache && + MemoryMapAllocationPool::Instance().BufferSize() < + static_cast<size_t>( + MemoryMapAllocationPool::Instance().MaxPoolSize())) { + MemoryMapAllocationPool::Instance().Insert(MemoryMapInfo( + flags_, map_size_ - mmap_alignment, ipc_name_, map_ptr_)); + } else { + if (info->refcount == 0 && + shm_open(ipc_name_.c_str(), O_RDWR, (mode_t)0600) != -1) { + shm_unlink(ipc_name_.c_str()); + VLOG(6) << "shm_unlink file: " << ipc_name_; + } + + PADDLE_ENFORCE_NE(munmap(map_ptr_, map_size_), + -1, + platform::errors::Unavailable( + "could not unmap the shared memory file: ", + strerror(errno), + " (", + errno, + ")")); + } } - - PADDLE_ENFORCE_NE( - munmap(map_ptr_, map_size_), - -1, - platform::errors::Unavailable("could not unmap the shared memory file: ", - strerror(errno), - " (", - errno, - ")")); } MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { @@ -299,6 +329,67 @@ void MemoryMapFdSet::Clear() { MemoryMapFdSet::~MemoryMapFdSet() { Clear(); } +MemoryMapAllocationPool *MemoryMapAllocationPool::pool_ = nullptr; + +void MemoryMapAllocationPool::Insert(const MemoryMapInfo &memory_map) { + std::lock_guard<std::mutex> guard(mtx_); + memory_map_allocations_.push_back(memory_map); + VLOG(4) << this << " Insert a new shm: " << memory_map.file_name_; +} + +int MemoryMapAllocationPool::FindFromCache(const int &flag, + const size_t &data_size, + const std::string &file_name, + bool check_refcount) { + std::lock_guard<std::mutex> guard(mtx_); + for (size_t idx = 0; idx < memory_map_allocations_.size(); idx++) { + if (memory_map_allocations_.at(idx).flags_ == flag && + memory_map_allocations_.at(idx).data_size_ == data_size) { + if (file_name == "" || + memory_map_allocations_.at(idx).file_name_ == file_name) { + if (!check_refcount || reinterpret_cast<CountInfo *>( + memory_map_allocations_.at(idx).mmap_ptr_) + ->refcount == 0) { + VLOG(4) << "Match at: " << idx; + return idx; + } + } + } + } + return -1; +} + +const MemoryMapInfo &MemoryMapAllocationPool::GetById(int id) { + std::lock_guard<std::mutex> guard(mtx_); + return memory_map_allocations_.at(id); +} + +void MemoryMapAllocationPool::SetMaxPoolSize(const int &size) { + max_pool_size_ = size; + VLOG(4) << this << " Set max pool size to: " << max_pool_size_; +} + +void MemoryMapAllocationPool::Clear() { + std::lock_guard<std::mutex> guard(mtx_); + for (auto mmap : memory_map_allocations_) { + int rlt = shm_unlink(mmap.file_name_.c_str()); + if (rlt == 0) { + VLOG(4) << "MemoryMapAllocationPool: clear " << mmap.file_name_; + } + PADDLE_ENFORCE_NE(munmap(mmap.mmap_ptr_, mmap.data_size_ + mmap_alignment), + -1, + platform::errors::Unavailable( + "could not unmap the shared memory file: ", + strerror(errno), + " (", + errno, + ")")); + } + memory_map_allocations_.clear(); +} + +MemoryMapAllocationPool::~MemoryMapAllocationPool() { Clear(); } + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h 
b/paddle/fluid/memory/allocation/mmap_allocator.h index 3fc5d2d1891f245c0096d9a49b76121a2b13ce16..412e3a3545769dc09542b4fd921fc1d44bae2de5 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -75,8 +75,12 @@ class MemoryMapAllocation : public Allocation { class RefcountedMemoryMapAllocation : public MemoryMapAllocation { public: - RefcountedMemoryMapAllocation( - void *ptr, size_t size, std::string ipc_name, int flags, int fd); + RefcountedMemoryMapAllocation(void *ptr, + size_t size, + std::string ipc_name, + int flags, + int fd, + int buffer_id = -1); void incref(); int decref(); @@ -84,6 +88,7 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation { virtual ~RefcountedMemoryMapAllocation() { close(); } protected: + int buffer_id_ = -1; void initializeRefercount(); void resetBaseptr(); }; @@ -94,7 +99,8 @@ void AllocateMemoryMap( std::shared_ptr<RefcountedMemoryMapAllocation> AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, - size_t size); + size_t size, + int buffer_id = -1); class MemoryMapWriterAllocation : public Allocation { public: @@ -153,6 +159,68 @@ class MemoryMapFdSet { std::mutex mtx_; }; +class MemoryMapInfo { + public: + explicit MemoryMapInfo(int flags, + size_t data_size, + std::string file_name, + void *mmap_ptr) + : flags_(flags), + data_size_(data_size), + file_name_(file_name), + mmap_ptr_(mmap_ptr) {} + + int flags_ = 0; + size_t data_size_ = 0; + std::string file_name_; + void *mmap_ptr_ = nullptr; +}; + +/* Note(zhangbo): MemoryMapAllocationPool is used to cache and reuse shm, thus reducing munmap calls in the dataloader. The munmap(shm_mmap_ptr) call in the RefcountedMemoryMapAllocation::close() function may block other threads of the process. Therefore, shm cache-and-reuse logic is introduced: shm segments created by the _share_filename process are cached and reused keyed on their data_size, which eliminates the problem of munmap blocking other threads. +*/ +class MemoryMapAllocationPool { + public: + static MemoryMapAllocationPool &Instance() { + if (pool_ == nullptr) { + pool_ = new MemoryMapAllocationPool(); + } + return *pool_; + } + + void Insert(const MemoryMapInfo &memory_map); + + int FindFromCache(const int &flag, + const size_t &data_size, + const std::string &file_name = "", + bool check_refcount = true); + + const MemoryMapInfo &GetById(int id); + + size_t BufferSize() { return memory_map_allocations_.size(); } + + void Clear(); + + void SetMaxPoolSize(const int &size); + + int MaxPoolSize() { return max_pool_size_; } + + ~MemoryMapAllocationPool(); + + private: + MemoryMapAllocationPool() = default; + static MemoryMapAllocationPool *pool_; + std::vector<MemoryMapInfo> memory_map_allocations_; + int max_pool_size_ = 0; + std::mutex mtx_; +}; + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index 78292738788f98c12558bdb3397b1367302adfad..3a97e275689ed032b3fdebd56d6b4be62c7b30af 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -44,6 +44,7 @@ if(WITH_TESTING) cinn_launch_context cinn_instruction_run_op cinn) + target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES}) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") @@ -73,6 +74,7 @@ if(WITH_TESTING) cinn_launch_op cinn_instruction_run_op elementwise_add_op) + target_link_libraries(cinn_instruction_run_op_test 
${PYTHON_LIBRARIES}) set_tests_properties( cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index e076ead891419c00292958fe170a16053b327164..194dccb0e6ea03ebbf16af5ff02d93bc883589cd 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -11,6 +11,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/raw_tensor.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" namespace paddle { @@ -28,117 +30,128 @@ class OpBase; namespace paddle { namespace operators { -// FeedVariableVisitor is to feed the variable data -// according to data type (phi::DenseTensor or Strings). -class FeedVariableVisitor { - public: - explicit FeedVariableVisitor(framework::Variable *out_var, - const platform::Place &place) - : out_var_(out_var), place_(place) {} - - void operator()(const phi::DenseTensor &in_tensor) const { - phi::DenseTensor *out_tensor = out_var_->GetMutable(); - if (platform::is_same_place(in_tensor.place(), place_)) { - out_tensor->ShareDataWith(in_tensor); -#ifdef PADDLE_WITH_IPU - } else if (platform::is_ipu_place(place_)) { - // For ipu, both in_tensor and out_tensor are allocated on cpu, - // PopART will copy tensor from host automatically, - // no TensorCopy() is required here. - out_tensor->ShareDataWith(in_tensor); -#endif - } else { - platform::DeviceContext *context = - platform::DeviceContextPool::Instance().Get(place_); - framework::TensorCopy(in_tensor, place_, *context, out_tensor); - } - out_tensor->set_lod(in_tensor.lod()); +const framework::FeedType& CheckAndGetFeedItem(const phi::ExtendedTensor& x, + int col) { + PADDLE_ENFORCE_GE(col, + 0, + platform::errors::InvalidArgument( + "Expected the column index (the attribute 'col' of " + "operator 'Feed') of current feeding variable to be " + "no less than 0. But received column index = %d.", + col)); + auto feed_list = static_cast(&x); + PADDLE_ENFORCE_LT( + static_cast(col), + feed_list->size(), + platform::errors::InvalidArgument( + "The column index of current feeding variable is expected to be " + "less than the length of feeding list. 
But received column index = " + "%d, the length of feeding list = %d", + col, + feed_list->size())); + + return feed_list->at(static_cast(col)); +} + +template +void FeedDenseTensorKernel(const Context& dev_ctx, + const phi::ExtendedTensor& x, + int col, + phi::DenseTensor* out) { + PADDLE_ENFORCE_NOT_NULL( + out, + platform::errors::NotFound( + "Output cannot be found in scope for operator 'Feed'")); + const auto& feed_item = CheckAndGetFeedItem(x, col); + const auto& in_tensor = paddle::get(feed_item); + const auto& place = dev_ctx.GetPlace(); + if (platform::is_same_place(in_tensor.place(), place)) { + out->ShareDataWith(in_tensor); + } else { + framework::TensorCopy(in_tensor, place, dev_ctx, out); } - void operator()(const framework::Strings &in_str) const { - framework::Strings *out_str = out_var_->GetMutable(); - out_str->resize(in_str.size()); - *out_str = in_str; + out->set_lod(in_tensor.lod()); +} + +template +void FeedSparseCooTensorKernel(const Context& dev_ctx, + const phi::ExtendedTensor& x, + int col, + phi::SparseCooTensor* out) { + PADDLE_ENFORCE_NOT_NULL( + out, + platform::errors::NotFound( + "Output cannot be found in scope for operator 'Feed'")); + const auto& feed_item = CheckAndGetFeedItem(x, col); + const auto& in_tensor = paddle::get(feed_item); + const auto& place = dev_ctx.GetPlace(); + if (platform::is_same_place(in_tensor.place(), place)) { + *out = in_tensor; + } else { + phi::DenseTensor indices, values; + framework::TensorCopy(in_tensor.indices(), place, dev_ctx, &indices); + framework::TensorCopy(in_tensor.values(), place, dev_ctx, &values); + out->SetMember(indices, values, in_tensor.meta()); } +} + +template +void FeedStringsKernel(const Context& dev_ctx, + const phi::ExtendedTensor& x, + int col, + phi::ExtendedTensor* out) { + PADDLE_ENFORCE_NOT_NULL( + out, + platform::errors::NotFound( + "Output cannot be found in scope for operator 'Feed'")); + const auto& feed_item = CheckAndGetFeedItem(x, col); + auto strs_out = static_cast(out); + const auto& in_str = paddle::get(feed_item); + strs_out->resize(in_str.size()); + *strs_out = in_str; +} + +class FeedOp : public framework::OperatorWithKernel { + using framework::OperatorWithKernel::OperatorWithKernel; - void operator()(const phi::SparseCooTensor &in_tensor) const { - phi::SparseCooTensor *out_tensor = - out_var_->GetMutable(); - if (platform::is_same_place(in_tensor.place(), place_)) { - *out_tensor = in_tensor; - } else { - platform::DeviceContext *context = - platform::DeviceContextPool::Instance().Get(place_); - - phi::DenseTensor indices, values; - framework::TensorCopy(in_tensor.indices(), place_, *context, &indices); - framework::TensorCopy(in_tensor.values(), place_, *context, &values); - out_tensor->SetMember(indices, values, in_tensor.meta()); + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "feed"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "feed"); + if (ctx->IsRuntime()) { + framework::Variable* x_var = + PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]); + auto& x = x_var->Get(); + int col = ctx->Attrs().Get("col"); + auto& feed_item = x[col]; + if (feed_item.index() == 0) { + const auto& feed_item = CheckAndGetFeedItem(x, col); + auto& feed_tensor = PADDLE_GET_CONST(phi::DenseTensor, feed_item); + ctx->SetOutputDim("Out", feed_tensor.dims()); + } else if (feed_item.index() == 1) { + auto& feed_str = PADDLE_GET_CONST(framework::Strings, feed_item); + framework::Variable* out_var = + 
PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]); + out_var->GetMutable()->resize(feed_str.size()); + } else { + auto& feed_sparse_tensor = + PADDLE_GET_CONST(phi::SparseCooTensor, feed_item); + framework::Variable* out_var = + PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]); + out_var->GetMutable()->set_meta( + feed_sparse_tensor.meta()); + out_var->GetMutable()->SetCoalesced( + feed_sparse_tensor.coalesced()); + out_var->GetMutable()->SetIndicesDict( + feed_sparse_tensor.GetIndicesDict()); + } } } - private: - framework::Variable *out_var_; - const platform::Place &place_; -}; - -class FeedOp : public framework::OperatorBase { - public: - FeedOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - OP_INOUT_CHECK(HasInputs("X"), "Input", "X", "Feed"); - OP_INOUT_CHECK(HasOutputs("Out"), "Output", "Out", "Feed"); - - auto feed_var_name = Input("X"); - auto *feed_var = scope.FindVar(feed_var_name); - PADDLE_ENFORCE_NOT_NULL( - feed_var, - platform::errors::NotFound( - "Input varibale(%s) cannot be found in scope for operator 'Feed'.", - feed_var_name)); - - auto out_name = this->Output("Out"); - auto *out_var = scope.FindVar(out_name); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Output variable(%s) cannot be found in scope for operator 'Feed'", - out_name)); - - auto col = Attr("col"); - PADDLE_ENFORCE_GE(col, - 0, - platform::errors::InvalidArgument( - "Expected the column index (the attribute 'col' of " - "operator 'Feed') of current feeding variable to be " - "no less than 0. But received column index = %d.", - col)); - - VLOG(3) << "Feed variable " << feed_var_name << "'s " << col - << " column to variable " << out_name; - - auto &feed_list = feed_var->Get(); - PADDLE_ENFORCE_LT( - static_cast(col), - feed_list.size(), - platform::errors::InvalidArgument( - "The column index of current feeding variable is expected to be " - "less than the length of feeding list. But received column index = " - "%d, the length of feeding list = %d", - col, - feed_list.size())); - - auto &feed_item = feed_list.at(static_cast(col)); - - FeedVariableVisitor visitor(out_var, place); - paddle::visit(visitor, feed_item); + protected: + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); } }; @@ -164,9 +177,152 @@ It should not be configured by users directly. } // namespace operators } // namespace paddle +// TODO(YuanRisheng): Maybe we need design a new registry macro for +// registering device independent kernels. 
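// Editorial note: FeedOp::InferShape above dispatches on the paddle::variant
// index of the feed item, and the kernel registrations below mirror the same
// three cases. Roughly (sketch, not literal patch code):
//   switch (feed_item.index()) {
//     case 0:  /* phi::DenseTensor      -> feed_dense_tensor      */ break;
//     case 1:  /* framework::Strings    -> feed_strings           */ break;
//     default: /* phi::SparseCooTensor  -> feed_sparse_coo_tensor */ break;
//   }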
+ REGISTER_OPERATOR( feed, paddle::operators::FeedOp, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, paddle::operators::FeedOpInfoMaker); + +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + CPU, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} + +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + CPU, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} + +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + CPU, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_MKLDNN) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + OneDNN, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + OneDNN, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + OneDNN, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + GPU, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + GPU, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + GPU, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#elif defined(PADDLE_WITH_XPU) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + XPU, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + XPU, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + XPU, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#elif defined(PADDLE_WITH_ASCEND_CL) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + npu, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + npu, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + npu, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#elif defined(PADDLE_WITH_MLU) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + CustomMLU, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + CustomMLU, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + CustomMLU, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} + +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) +PD_REGISTER_GENERAL_KERNEL( + feed_dense_tensor, + custom_cpu, + ALL_LAYOUT, + paddle::operators::FeedDenseTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_sparse_coo_tensor, + custom_cpu, + ALL_LAYOUT, + paddle::operators::FeedSparseCooTensorKernel, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL( + feed_strings, + custom_cpu, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/fluid/operators/cum_op.cc b/paddle/fluid/operators/cum_op.cc index 6d1089ecf72a44850cfdd9e7cfd666fc15b5e7de..4c23020413ee5fbabbec88ce81439ce821df4008 100644 --- a/paddle/fluid/operators/cum_op.cc +++ b/paddle/fluid/operators/cum_op.cc @@ -33,6 +33,27 @@ class CumOp : public framework::OperatorWithKernel { 
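// Editorial note: the CumGradOp added in this hunk gives cumsum a dedicated
// grad op. Previously CumsumGradMaker re-ran cumsum on Out@GRAD with the
// "reverse" attribute flipped; the maker now emits (hypothetical sketch of the
// generated descriptor, not literal code):
//   cumsum_grad(X, Out@GRAD, axis, flatten, exclusive, reverse) -> X@GRAD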
} }; +class CumGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "cumsum"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), + "Input", + "Out@GRAD", + "cumsum"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return phi::KernelKey(input_data_type, ctx.GetPlace()); + } +}; + class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -69,12 +90,13 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("cumsum"); - grad_op->SetInput("X", this->OutputGrad("Out")); - grad_op->SetOutput("Out", this->InputGrad("X")); + grad_op->SetType("cumsum_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetAttrMap(this->Attrs()); grad_op->SetAttr("reverse", - !PADDLE_GET_CONST(bool, this->GetAttr("reverse"))); + PADDLE_GET_CONST(bool, this->GetAttr("reverse"))); } }; @@ -153,6 +175,7 @@ using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, PD_INFER_META(phi::CumScalarAxisInferMeta)); + DECLARE_INFER_SHAPE_FUNCTOR(logcumsumexp, LogcumsumexpInferShapeFunctor, PD_INFER_META(phi::CumInferMeta)); @@ -169,6 +192,7 @@ REGISTER_OPERATOR(logcumsumexp, ops::LogcumsumexpGradMaker, LogcumsumexpInferShapeFunctor); REGISTER_OPERATOR(logcumsumexp_grad, ops::LogcumsumexpGradOp); +REGISTER_OPERATOR(cumsum_grad, ops::CumGradOp); REGISTER_OP_VERSION(cumsum).AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc deleted file mode 100644 index 62cecbd36ae47b33eb77e3397dc13b5c6eda500e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/determinant_op.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class SlogDeterminantOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } -}; - -class SlogDeterminantOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "(Tensor) The input tensor of SlogDeterminant."); - AddOutput("Out", - "(Tensor) The output tensor containing the sign of the" - "determinant and the natural logarithm" - "of the absolute value of determinant,"); - - AddComment(R"DOC( -SlogDeterminant Operator.)DOC"); - } -}; - -class SlogDeterminantGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Input"), "Input", "Input", "SlogDeterminantGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Out"), "Input", "Out", "SlogDeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "SlogDeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), - "Output", - framework::GradVarName("Input"), - "SlogDeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class SlogDeterminantGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("slogdeterminant_grad"); - grad_op->SetInput("Input", this->Input("Input")); - grad_op->SetInput("Out", this->Output("Out")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("Input"), - this->InputGrad("Input")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, - "Input"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant, - SlogDeterminantInferShapeFunctor, - PD_INFER_META(phi::UnchangedInferMeta)); -REGISTER_OPERATOR(slogdeterminant, - ops::SlogDeterminantOp, - ops::SlogDeterminantOpMaker, - ops::SlogDeterminantGradOpMaker, - ops::SlogDeterminantGradOpMaker, - SlogDeterminantInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant_grad, - SlogDeterminantGradInferShapeFunctor, - PD_INFER_META(phi::GeneralUnaryGradInferMeta)); -REGISTER_OPERATOR(slogdeterminant_grad, - ops::SlogDeterminantGradOp, - SlogDeterminantGradInferShapeFunctor) // reuse det grad op diff --git 
a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 5048a40dddea4a0669291108b32e0441898f53d2..4052f3e09e0cc2f29271bd0a9977340d39af4700 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" +#include "paddle/fluid/prim/utils/static/desc_tensor.h" namespace paddle { namespace operators { @@ -63,6 +66,34 @@ class ElementwiseMulOpGradMaker : public framework::SingleGradOpMaker<T> { } }; +class ElementwiseMulGradCompositeOpMaker + : public prim::GradCompositeOpMakerBase { + using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; + + public: + void Apply() override { + auto x = this->GetSingleForwardInput("X"); + auto y = this->GetSingleForwardInput("Y"); + auto out_grad = this->GetSingleOutputGrad("Out"); + auto x_grad = this->GetSingleInputGrad("X"); + auto x_grad_p = this->GetOutputPtr(&x_grad); + auto x_grad_name = this->GetOutputName(x_grad); + auto y_grad = this->GetSingleInputGrad("Y"); + auto y_grad_p = this->GetOutputPtr(&y_grad); + auto y_grad_name = this->GetOutputName(y_grad); + prim::multiply_grad<prim::DescTensor>( + x, + y, + out_grad, + static_cast<int>(this->Attr<int>("axis")), + x_grad_p, + y_grad_p); + VLOG(3) << "Running mul_grad composite func"; + this->RecoverOutputName(x_grad, x_grad_name); + this->RecoverOutputName(y_grad, y_grad_name); + } +}; + template <typename T> class ElementwiseMulDoubleGradMaker : public framework::SingleGradOpMaker<T> { public: @@ -123,7 +154,8 @@ REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseMulOpMaker, ops::ElementwiseOpInferVarType, ops::ElementwiseMulOpGradMaker<paddle::framework::OpDesc>, - ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>); + ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>, + ops::ElementwiseMulGradCompositeOpMaker); REGISTER_OPERATOR( elementwise_mul_grad, ops::ElementwiseOpGrad, diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index cbd322f38767e9e3598a90ae28c2cd172c41552d..9a867c040fcb8ea3a238ce340ab2f5234a7424d1 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -20,6 +20,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" +#include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" @@ -190,6 +193,24 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker { } }; +class ExpandV2GradCompositeOpMaker : public prim::GradCompositeOpMakerBase { + using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; + + public: + void Apply() override { + auto x = this->GetSingleForwardInput("X"); + auto out_grad = this->GetSingleOutputGrad("Out"); + auto x_grad = this->GetSingleInputGrad("X"); + auto x_grad_p = this->GetOutputPtr(&x_grad); + auto x_grad_name = this->GetOutputName(x_grad); + auto shape = this->Attr>("shape"); + prim::expand_grad( + x, out_grad, paddle::experimental::IntArray(shape), x_grad_p); + VLOG(3) << "Runing expand_v2 composite func"; + this->RecoverOutputName(x_grad, x_grad_name); + } +}; + template class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker { public: @@ -223,6 +244,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, + ops::ExpandV2GradCompositeOpMaker, ops::ExpandV2GradOpMaker, ops::ExpandV2GradOpMaker, ExpandInferShapeFunctor); diff --git a/paddle/fluid/operators/generator/generate_op.py b/paddle/fluid/operators/generator/generate_op.py index c16be45f8f1ecda325d962f511c9750d62e61f27..e4bb7041016d21f36764fc7cce0674c56f5f1b3d 100644 --- a/paddle/fluid/operators/generator/generate_op.py +++ b/paddle/fluid/operators/generator/generate_op.py @@ -256,6 +256,16 @@ def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict): op_item['no_need_buffer'] = get_param_list_alias( op_item['no_need_buffer'], args_map ) + if 'data_transform' in op_item and op_item['data_transform']: + data_trans_item = op_item['data_transform'] + if 'skip_transform' in data_trans_item: + data_trans_item['skip_transform'] = get_param_list_alias( + data_trans_item['skip_transform'], args_map + ) + if 'support_trans_dtype' in data_trans_item: + data_trans_item['support_trans_dtype'] = get_param_list_alias( + data_trans_item['support_trans_dtype'], args_map + ) process_scalar(op_item, scalar_configs) process_int_array(op_item, int_array_configs) diff --git a/paddle/fluid/operators/generator/parse_utils.py b/paddle/fluid/operators/generator/parse_utils.py index 7d153ff868eeb8285df9c2ea5b35930d7d8f3859..d5a58a2a94a0ef67ca643c846c577b971ede4a49 100644 --- a/paddle/fluid/operators/generator/parse_utils.py +++ b/paddle/fluid/operators/generator/parse_utils.py @@ -427,7 +427,41 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"): else: no_buffer_args = None - # TODO(chenfeiyu): data_transform + # add data_transform tag for every input. + # the format is {data_transform : {skip_transform : [x, z], support_trans_dtype : y}} + for input in inputs: + input["data_transform"] = {} + if "data_transform" in op_entry: + skip_trans_args = [] + support_trans_args = [] + data_trans = op_entry["data_transform"] + if "skip_transform" in data_trans: + skip_trans_args = parse_plain_list(data_trans["skip_transform"]) + for name in skip_trans_args: + assert ( + name in input_names + ), f"{op_name} has an skip_transform input: '{name}' which is not an input." 
+ data_trans["skip_transform"] = skip_trans_args + if "support_trans_dtype" in data_trans: + support_trans_args = parse_plain_list( + data_trans["support_trans_dtype"] + ) + for name in support_trans_args: + assert ( + name in input_names + ), f"{op_name} has an support_trans_dtype input: '{name}' which is not an input." + data_trans["support_trans_dtype"] = support_trans_args + for input in inputs: + if input["name"] in skip_trans_args: + input["data_transform"]["skip_trans_args"] = True + else: + input["data_transform"]["skip_trans_args"] = False + if input["name"] in support_trans_args: + input["data_transform"]["support_trans_dtype"] = True + else: + input["data_transform"]["support_trans_dtype"] = False + else: + data_trans = None op = { "name": op_name, @@ -435,6 +469,7 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"): "attrs": attrs, "outputs": outputs, "no_need_buffer": no_buffer_args, + "data_transform": data_trans, } # invokes another op ? diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 index 37b5075235a83d7ce09f0c6fa0ebcf0463fc3cfb..000e56453d934f248ab6e427c722b997bb1d0032 100644 --- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 +++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 @@ -117,6 +117,15 @@ static_cast(phi::Place({{"phi::" if not default_value is initializer_list}} {# --------------------------------------- name mapping ---------------------------------------------- #} {% macro name_map(op) %} +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping: + +{{op | cartesian_prod_mapping}} +****************************************************************** +*/ + KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { {% set kernel_args = op["kernel"]["param"] %} {{get_input_list(op["inputs"], kernel_args)}}; @@ -136,15 +145,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum return sig; {%endif%} } - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping: - -{{op | cartesian_prod_mapping}} -****************************************************************** -*/ {% endmacro %} {% macro get_kernel_dispatch(inputs, kernel_config) %}{# inline #} @@ -172,6 +172,15 @@ ctx.IsSparseCsrTensorInput("{{input["name"]}}"){{" && " if not loop.last}} {%- endmacro %} {% macro sparse_op_name_map(op) %} +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping: + +{{op | cartesian_prod_mapping}} +****************************************************************** +*/ + KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { {% set kernel_args = op["kernel"]["param"] %} {{get_input_list(op["inputs"], kernel_args)}}; @@ -188,15 +197,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), 
std::move(outputs)); return sig; } - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping: - -{{op | cartesian_prod_mapping}} -****************************************************************** -*/ {% endmacro %} {% macro register_base_kernel_name(op) %} @@ -284,6 +284,32 @@ phi::KernelKey GetExpectedKernelType( } {% endmacro %} +{% macro get_kernel_for_var(op) %} {# only for data_transform #} +{% set skip_args = op["data_transform"]["skip_transform"] %} +{% set var_name = "var_name" %} +{% set skip_args_len = skip_args | length %} +phi::KernelKey GetKernelTypeForVar( + const std::string& {{var_name}}, + const phi::DenseTensor& tensor, + const phi::KernelKey& expected_kernel_type) const override { + + if ( + {%- for skip_arg in skip_args -%} + var_name == "{{ skip_arg }}" + {%- if skip_args_len != 1 and loop.index != skip_args_len %} || {% endif -%} + {%- endfor -%} + ){ + return phi::KernelKey(phi::Backend::ALL_BACKEND, + expected_kernel_type.layout(), + expected_kernel_type.dtype()); + } + else{ + return phi::KernelKey( + tensor.place(), tensor.layout(), expected_kernel_type.dtype()); + } + } +{% endmacro %} + {# --------------------------------------- operator ---------------------------------------------- #} {% macro operator(op) %} class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKernel { @@ -293,9 +319,17 @@ class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKerne {% set kernel = op["kernel"] %} {% if kernel["data_type"] is not none %} protected: - {% filter indent(2, True)%} + {% filter indent(2, True)%} {{get_expected_kernel(op)}} - {% endfilter %} + {% endfilter %} + {%- if "data_transform" in op and op["data_transform"] is not none -%} + {%- if "skip_transform" in op["data_transform"] -%} + {% filter indent(2, True) %} +{{get_kernel_for_var(op)}} + {% endfilter %} + {%- endif %} + {%- endif -%} +{# TODO(lizhiyu): add the 'support_trans_dtype' #} {% endif %} }; diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc deleted file mode 100644 index afdbaf0ca7729e6fff508127e1f0bdd77e383311..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/ternary.h" - -namespace paddle { -namespace operators { - -class GraphSendRecvOP : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class GraphSendRecvGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(framework::GradVarName("X"), in_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context().GetPlace()); - } -}; - -class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input tensor with data type float32, float64, int32, int64."); - AddInput("Src_index", "The source index tensor."); - AddInput("Dst_index", "The destination index tensor."); - AddInput("Out_size", - "(Tensor, optional). The 0th dimension of the output." - "It has a higher priority than Attr(out_size).") - .AsDispensable(); - AddOutput("Out", "Output tensor of graph_send_recv op."); - AddOutput("Dst_count", - "Count tensor of Dst_index, mainly for MEAN reduce_op.") - .AsIntermediate(); - AddAttr("reduce_op", - "(string, default 'SUM')" - "Define different pool types to receive the result " - "tensors of Dst_index.") - .SetDefault("SUM") - .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - AddAttr>( - "out_size", - "(vector, default {0})" - "Define the first dimension of Output tensor." - "If set default {0}, then the shape of Out is the same with X.") - .SetDefault({0}); - AddComment(R"DOC( -Graph Learning Send_Recv combine operator. - -$Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$ - -This operator is mainly used in Graph Learning domain, and the main purpose is to reduce -intermediate memory consumption in the process of message passing. -Take `x` as the input tensor, we first use `src_index` to gather corresponding data, -and then use `dst_index` to update the corresponding position of output tensor in different -pooling types, like sum, mean, max, or min. 
- -)DOC"); - } -}; - -template -class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("graph_send_recv_grad"); - op->SetInput("Src_index", this->Input("Src_index")); - op->SetInput("Dst_index", this->Input("Dst_index")); - op->SetInput("X", this->Input("X")); - - if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") { - op->SetInput("Dst_count", this->Output("Dst_count")); - } - - if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" || - PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") { - op->SetInput("Out", this->Output("Out")); - } - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, - GraphSendRecvInferShapeFunctor, - PD_INFER_META(phi::SendURecvInferMeta)); -REGISTER_OPERATOR(graph_send_recv, - ops::GraphSendRecvOP, - ops::GraphSendRecvOpMaker, - ops::GraphSendRecvGradOpMaker, - ops::GraphSendRecvGradOpMaker, - GraphSendRecvInferShapeFunctor); -REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc deleted file mode 100644 index 2a252bcf70368cd53bf2b3f597fb281f5e52f148..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class GraphSendUERecvOP : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class GraphSendUERecvGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(framework::GradVarName("X"), in_dims); - auto y_dims = ctx->GetInputDim("Y"); - ctx->SetOutputDim(framework::GradVarName("Y"), y_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context().GetPlace()); - } -}; - -class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input tensor with data type float32, float64, int32, int64."); - AddInput("Y", - "The input edge weight tensor, data type should be same with X"); - AddInput("Src_index", "The source index tensor."); - AddInput("Dst_index", "The destination index tensor."); - AddInput("Out_size", - "(Tensor, optional). The 0th dimension of the output." - "It has a higher priority than Attr(out_size).") - .AsDispensable(); - AddOutput("Out", "Output tensor of graph_send_ue_recv op."); - AddOutput("Dst_count", - "Count tensor of Dst_index, mainly for MEAN reduce_op.") - .AsIntermediate(); - AddAttr("message_op", - "(string, default 'ADD')" - "Define differenct computation types between X and E.") - .SetDefault("ADD") - .InEnum({"ADD", "MUL"}); - AddAttr("reduce_op", - "(string, default 'SUM')" - "Define different pool types to receive the result " - "tensors of Dst_index.") - .SetDefault("SUM") - .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - AddAttr>( - "out_size", - "(vector, default {0})" - "Define the first dimension of Output tensor." - "If set default {0}, then the shape of Out is the same with X.") - .SetDefault({0}); - AddComment(R"DOC( -Graph Learning Send_UE_Recv combine operator. - -$Out = Recv(Compute(Send(X, Src_index), Y, message_op), Dst_index, reduce_op)$ - -This operator is mainly used in Graph Learning domain, and the main purpose is to reduce -intermediate memory consumption in the process of message passing. - -Take `X` as the input tensor, we first use `src_index` to gather corresponding data. -Then the gather data should compute with `Y` in different message_ops, like add, sub, mul, and div, -and get the computation result. Then, use `dst_index` to update the corresponding position of output -tensor in different pooling types, like sum, mean, max, or min. 
- -)DOC"); - } -}; - -template -class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("graph_send_ue_recv_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput("Src_index", this->Input("Src_index")); - op->SetInput("Dst_index", this->Input("Dst_index")); - - if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") { - op->SetInput("Dst_count", this->Output("Dst_count")); - } - - if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" || - PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") { - op->SetInput("Out", this->Output("Out")); - } - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(graph_send_ue_recv, - GraphSendUERecvInferShapeFunctor, - PD_INFER_META(phi::SendUERecvInferMeta)); -REGISTER_OPERATOR(graph_send_ue_recv, - ops::GraphSendUERecvOP, - ops::GraphSendUERecvOpMaker, - ops::GraphSendUERecvGradOpMaker, - ops::GraphSendUERecvGradOpMaker, - GraphSendUERecvInferShapeFunctor); -REGISTER_OPERATOR(graph_send_ue_recv_grad, ops::GraphSendUERecvGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index afef765c6ff71ce9f1a97e915d4f933558c138d2..2b337887faa3f8fe2dde678fa6caec12256adeb3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -84,7 +84,7 @@ class ReduceSumCompositeGradOpMaker : public prim::GradCompositeOpMakerBase { // get output orginal name std::string x_grad_name = this->GetOutputName(x_grad_t); - + VLOG(3) << "Runing sum_grad composite func"; // call composite backward func prim::sum_grad( x, out_grad, axis, keep_dim, reduce_all, x_grad); diff --git a/paddle/fluid/operators/select_output_op.cc b/paddle/fluid/operators/select_output_op.cc index f57933bab0c0b8b18334bc6e9971e663275c3a3b..e70c59b20a64f02ae2d15787a56873c5d7c7f9ed 100644 --- a/paddle/fluid/operators/select_output_op.cc +++ b/paddle/fluid/operators/select_output_op.cc @@ -95,7 +95,8 @@ class SelectOutputInferShape : public framework::InferShapeBase { void operator()(framework::InferShapeContext *context) const override { OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SelectOutput"); OP_INOUT_CHECK(context->HasInput("Mask"), "Input", "Mask", "SelectOutput"); - OP_INOUT_CHECK(context->HasOutputs("Out"), "Output", "Out", "SelectOutput"); + OP_INOUT_CHECK( + context->HasOutputs("Out", true), "Output", "Out", "SelectOutput"); } }; diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc deleted file mode 100644 index 695807a4c3c6e529c4b8222ef410253ca69b2d09..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/size_op.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class SizeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto dtype = framework::proto::VarType::FP32; // dtype is not important - return phi::KernelKey(dtype, ctx.GetPlace()); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string& var_name, - const phi::DenseTensor& tensor, - const phi::KernelKey& expected_kernel_type) const override { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } -}; - -class SizeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "The input tensor."); - AddOutput("Out", - "The returned tensor, the data type " - "is int64_t, will be on the same device with the input Tensor."); - AddComment(R"DOC( -Size Operator. - -Return the number of elements in the input. -)DOC"); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SizeOpNoNeedBufferVarInferer, "Input"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(size, - SizeInferShapeFunctor, - PD_INFER_META(phi::NumelInferMeta)); -REGISTER_OPERATOR( - size, - ops::SizeOp, - ops::SizeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - SizeInferShapeFunctor, - ops::SizeOpNoNeedBufferVarInferer); diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 35128b0085687e8b8e9f5d27d2cd03c364d2928e..97bdd2784e9ca5efe05398c0d244aacd80e25473 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -407,8 +407,8 @@ int BertTokenizer::Encode( void BertTokenizer::BatchEncode( vector>>* batch_encode_inputs, - const vector& batch_text, - const vector& batch_text_pair /* = vector() */, + const framework::Strings& batch_text, + const framework::Strings& batch_text_pair /* = vector() */, bool is_split_into_words /* = false */, const size_t max_seq_len /* = 0 */, bool pad_to_max_seq_len /* = false */) const { diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h index 0c9819025241949ffcdd5e751ca2d74244ddbcf0..789d54852c27e6acea207a177826754a9383ca0b 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -100,8 +100,8 @@ class BertTokenizer { bool pad_to_max_seq_len = false) const; void BatchEncode( vector>>* batch_encode_inputs, - const vector& batch_text, - const vector& batch_text_pair = vector(), + const framework::Strings& batch_text, + const framework::Strings& batch_text_pair = framework::Strings(), bool 
is_split_into_words = false, const size_t max_seq_len = 0, bool pad_to_max_seq_len = false) const; @@ -162,7 +162,7 @@ class FasterTokenizerKernel : public framework::OpKernel { } else { tokenizer.BatchEncode(&batch_encode_inputs, *text, - vector(), + framework::Strings(), is_split_into_words, max_seq_len, pad_to_max_seq_len); diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 7a5acb762eb83bbc52254cc2427938a1c8f0ba39..2f14a23168533cfdf34072b30a26b186d039d2c1 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -18,6 +18,7 @@ #include "paddle/phi/backends/all_context.h" DECLARE_bool(use_stream_safe_cuda_allocator); +DECLARE_bool(new_executor_use_cuda_graph); namespace paddle { namespace platform { @@ -43,7 +44,10 @@ void BeginCUDAGraphCapture(phi::GPUPlace place, auto stream = dev_ctx->stream(); CUDAGraph::BeginCapture(place, stream, mode); - auto old_value = FLAGS_use_stream_safe_cuda_allocator; + // When using cuda graph in new executor, fast GC must be used. + // FLAGS_use_stream_safe_cuda_allocator should be true. + auto old_value = FLAGS_use_stream_safe_cuda_allocator && + !FLAGS_new_executor_use_cuda_graph; if (old_value) { FLAGS_use_stream_safe_cuda_allocator = false; } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index cebb36cbc6462ba05c484d84b843b3377a2af149..a49d9013fb6d0c50fdddc09b9da104355f37b4b0 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -20,6 +20,7 @@ namespace platform { void CudaProfilerInit(const std::string& output_file, const std::string& output_mode, const std::string& config_file) { +#if CUDA_VERSION < 11000 PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", platform::errors::InvalidArgument( "Unsupported cuda profiler output mode, expect `kvp` or " @@ -28,6 +29,7 @@ void CudaProfilerInit(const std::string& output_file, cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE_GPU_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); +#endif } void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2c90acd6100980ec40d42393c8b8af5472b1f510..94236fcff1a4381bde7a30b136d415bee81e43e2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -183,6 +183,7 @@ class XPUDeviceContext : public phi::XPUContext { virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } xpuStream stream() const { return XPUContext::x_context()->xpu_stream; } + void CreateStream() { XPUContext::CreateStream(); } }; template <> diff --git a/paddle/fluid/prim/api/.gitignore b/paddle/fluid/prim/api/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..377e800f00a0e08893961c1910c9b479e3143181 --- /dev/null +++ b/paddle/fluid/prim/api/.gitignore @@ -0,0 +1,3 @@ +generated/prim_api/eager_prim_api.cc +generated/prim_api/tmp_eager_prim_api.cc +generated/prim_api/*.h diff --git a/paddle/fluid/prim/api/CMakeLists.txt b/paddle/fluid/prim/api/CMakeLists.txt index 534ddec6b5c3cbd2e04a15c23d433444a3208b3c..436cecc32582b39cfe08b2a06f9d4dba55387f50 100644 --- a/paddle/fluid/prim/api/CMakeLists.txt +++ b/paddle/fluid/prim/api/CMakeLists.txt @@ -1,4 +1,7 @@ +add_subdirectory(auto_code_generated) add_subdirectory(manual) +add_subdirectory(generated) + if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( prim_api diff --git a/paddle/fluid/prim/api/all.h b/paddle/fluid/prim/api/all.h index 308eb91b4f11797c6e9a826b9d290eab951b7cf0..2996d2aa2657c8b8c09cfabd30daa7c2adf707b6 100644 --- a/paddle/fluid/prim/api/all.h +++ b/paddle/fluid/prim/api/all.h @@ -13,6 +13,6 @@ // limitations under the License. 
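+// NOTE: prim api declarations generated from the parsed ops yaml live under
+// generated/prim_api, while hand-written composite backward rules and utils
+// stay under manual/; this header aggregates both (see prim/api/CMakeLists.txt).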
#pragma once +#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" -#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/utils/utils.h" diff --git a/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt b/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e36af681bbd89589d58e5a7003beacb83ff08c24 --- /dev/null +++ b/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt @@ -0,0 +1,38 @@ +set(api_yaml_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/ops.parsed.yaml" +) +set(legacy_api_yaml_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/legacy_ops.parsed.yaml" +) +set(tmp_eager_prim_api_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_eager_prim_api.cc" +) +set(tmp_prim_api_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_prim_api.h" +) +set(eager_prim_api_cc_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc" +) +set(prim_api_h_path + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/prim_api.h") +set(prim_api_gen_file + ${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated/prim_gen.py) + +message("prim api Code gen") +execute_process( + WORKING_DIRECTORY + ${CMAKE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated + COMMAND + ${PYTHON_EXECUTABLE} ${prim_api_gen_file} --api_yaml_path + ${legacy_api_yaml_path} ${api_yaml_path} --prim_api_header_path + ${tmp_prim_api_h_path} --eager_prim_api_source_path + ${tmp_eager_prim_api_cc_path} + RESULT_VARIABLE _result) +if(${_result}) + message(FATAL_ERROR "prim api generate failed, exiting.") +endif() +execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${tmp_prim_api_h_path} ${prim_api_h_path}) +execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${tmp_eager_prim_api_cc_path} ${eager_prim_api_cc_path}) +message("copy tmp_xxx_prim_api to xxx_prim_api") diff --git a/paddle/fluid/prim/api/auto_code_generated/prim_base.py b/paddle/fluid/prim/api/auto_code_generated/prim_base.py new file mode 100644 index 0000000000000000000000000000000000000000..d1ad94a7c3e40d66f402e4daa84fa76cbd7e728f --- /dev/null +++ b/paddle/fluid/prim/api/auto_code_generated/prim_base.py @@ -0,0 +1,342 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
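+# Sketch of the generated output (illustrative; based on the "pow" entry in
+# white_ops_list below): gene_prim_api_declaration() renders a templated
+# declaration such as
+#
+#     template <typename T>
+#     Tensor pow(const Tensor& x, const Scalar& y);
+#
+# and gene_eager_prim_api_code() renders the eager specialization that simply
+# forwards to ::pow_ad_func(x, y).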
+ + +# prim api list +white_ops_list = [ + "pow", + "scale", + "multiply", + "unsqueeze", + "expand", + "full", + "reshape", + "divide", + "sum", + "exp", +] + +inplace_out_type_map = { + "Tensor": "Tensor&", + "std::vector<Tensor>": "std::vector<Tensor>&", +} + +inplace_optional_out_type_map = { + "Tensor": "paddle::optional<Tensor>&", + "std::vector<Tensor>": "paddle::optional<std::vector<Tensor>>&", +} + + +class BaseAPI: + def __init__(self, api_item_yaml): + # self.api = api_item_yaml['op'] + self.api = api_item_yaml['name'] + + self.is_prim_api = False + if api_item_yaml['name'] in white_ops_list: + self.is_prim_api = True + + ####################################### + # inputs: + # names : [], list of input names + # input_info : {input_name : type} + # attrs: + # names : [], list of attribute names + # attr_info : { attr_name : (type, default_values)} + # outputs: + # names : [], list of output names + # types : [], list of output types + # out_size_expr : [], expression for getting size of vector + ######################################## + if self.is_prim_api: + ( + self.inputs, + self.attrs, + self.outputs, + self.optional_vars, + ) = self.parse_args(self.api, api_item_yaml) + + self.inplace_map = api_item_yaml['inplace'] + + def get_api_func_name(self): + return self.api + + # def is_inplace(self): + # if self.inplace_map + # return True + # return False + + def get_input_tensor_args(self, inplace_flag=False): + input_args = [] + inplace_type_map = { + "const Tensor&": "Tensor&", + "const paddle::optional<Tensor>&": "paddle::optional<Tensor>&", + "const std::vector<Tensor>&": "std::vector<Tensor>&", + "const paddle::optional<std::vector<Tensor>>&": "paddle::optional<std::vector<Tensor>>&", + } + for name in self.inputs['names']: + name = name.split('@')[0] + if inplace_flag and name in self.inplace_map.values(): + input_args.append( + inplace_type_map[self.inputs['input_info'][name]] + + ' ' + + name + ) + else: + input_args.append(self.inputs['input_info'][name] + ' ' + name) + return input_args + + def get_declare_args(self, inplace_flag=False): + declare_args = self.get_input_tensor_args(inplace_flag) + for name in self.attrs['names']: + default_value = '' + if self.attrs['attr_info'][name][1] is not None: + default_value = ' = ' + self.attrs['attr_info'][name][1] + declare_args.append( + self.attrs['attr_info'][name][0] + ' ' + name + default_value + ) + + return ", ".join(declare_args) + + def get_declare_args_nodefault(self, inplace_flag=False): + declare_args = self.get_input_tensor_args(inplace_flag) + for name in self.attrs['names']: + declare_args.append(self.attrs['attr_info'][name][0] + ' ' + name) + + return ", ".join(declare_args) + + def get_return_type(self, inplace_flag=False): + out_type_list = [] + for i, out_type in enumerate(self.outputs['types']): + out_name = self.outputs['names'][i].split('@')[0] + if inplace_flag and out_name in self.inplace_map: + if self.inplace_map[out_name] in self.optional_vars: + out_type_list.append( + inplace_optional_out_type_map[out_type] + ) + else: + out_type_list.append(inplace_out_type_map[out_type]) + else: + out_type_list.append(out_type) + if len(out_type_list) == 1: + return out_type_list[0] + else: + return "std::tuple<" + ", ".join(out_type_list) + ">" + + def parse_args(self, api_name, api_item_yaml): + optional_vars = [] + for input_dict in api_item_yaml['inputs']: + if input_dict['optional']: + optional_vars.append(input_dict['name']) + + inputs, attrs = self.parse_input_and_attr( + api_item_yaml['inputs'], api_item_yaml['attrs'] + ) + + output_type_list, output_names, out_size_expr = self.parse_output( + api_item_yaml['outputs'] + ) +
return ( + inputs, + attrs, + { + 'names': output_names, + 'types': output_type_list, + 'out_size_expr': out_size_expr, + }, + optional_vars, + ) + + def parse_input_and_attr(self, inputs_list, attrs_list): + input_types_map = { + 'Tensor': 'const Tensor&', + 'Tensor[]': 'const std::vector<Tensor>&', + } + attr_types_map = { + 'IntArray': 'const IntArray&', + 'Scalar': 'const Scalar&', + 'Scalar(int)': 'const Scalar&', + 'Scalar(int64_t)': 'const Scalar&', + 'Scalar(float)': 'const Scalar&', + 'Scalar(double)': 'const Scalar&', + 'Scalar[]': 'const std::vector<phi::Scalar>&', + 'int': 'int', + 'int32_t': 'int32_t', + 'int64_t': 'int64_t', + 'long': 'long', + 'size_t': 'size_t', + 'float': 'float', + 'float[]': 'const std::vector<float>&', + 'double': 'double', + 'bool': 'bool', + 'bool[]': 'const std::vector<bool>&', + 'str': 'const std::string&', + 'str[]': 'const std::vector<std::string>&', + 'Place': 'const Place&', + 'DataLayout': 'DataLayout', + 'DataType': 'DataType', + 'int64_t[]': 'const std::vector<int64_t>&', + 'int[]': 'const std::vector<int>&', + } + optional_types_trans = { + 'Tensor': 'const paddle::optional<Tensor>&', + 'Tensor[]': 'const paddle::optional<std::vector<Tensor>>&', + 'int': 'paddle::optional<int>', + 'int32_t': 'paddle::optional<int32_t>', + 'int64_t': 'paddle::optional<int64_t>', + 'float': 'paddle::optional<float>', + 'double': 'paddle::optional<double>', + 'bool': 'paddle::optional<bool>', + 'Place': 'paddle::optional<Place>', + 'DataLayout': 'paddle::optional<DataLayout>', + 'DataType': 'paddle::optional<DataType>', + } + + inputs = {'names': [], 'input_info': {}} + for input_dict in inputs_list: + inputs['names'].append(input_dict['name']) + if input_dict['optional']: + inputs['input_info'][input_dict['name']] = optional_types_trans[ + input_dict['typename'] + ] + else: + inputs['input_info'][input_dict['name']] = input_types_map[ + input_dict['typename'] + ] + + attrs = {'names': [], 'attr_info': {}} + for attr_dict in attrs_list: + attrs['names'].append(attr_dict['name']) + if 'default_value' in attr_dict.keys(): + default_value = attr_dict['default_value'] + else: + default_value = None + + if 'optional' in attr_dict.keys(): + attrs['attr_info'][attr_dict['name']] = ( + optional_types_trans[attr_dict['typename']], + default_value, + ) + else: + attrs['attr_info'][attr_dict['name']] = ( + attr_types_map[attr_dict['typename']], + default_value, + ) + return inputs, attrs + + def parse_output(self, outputs_list): + + out_type_list = [] + out_name_list = [] + out_size_expr_list = [] + for output_dict in outputs_list: + if output_dict['intermediate']: + continue + out_type_list.append(output_dict['typename']) + out_name_list.append(output_dict['name']) + if 'size' in output_dict.keys(): + out_size_expr_list.append(output_dict['size']) + else: + out_size_expr_list.append(None) + return out_type_list, out_name_list, out_size_expr_list + + +class EagerPrimAPI(BaseAPI): + def __init__(self, api_item_yaml): + super().__init__(api_item_yaml) + + def get_api__func_name(self): + api_func_name = self.api + # if self.is_inplace: + # if api_func_name[-1] != '_': + # api_func_name += '_' + # print("after api name", api_func_name) + return api_func_name + + def gene_prim_api_declaration(self): + api_declaration = "" + api_func_name = self.get_api__func_name() + if api_func_name[-1] != '_': + api_declaration = f""" +template <typename T> +{self.get_return_type()} {api_func_name}({self.get_declare_args()}); +""" + else: + api_declaration = ( + api_declaration + + f""" +template <typename T> +{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); +""" + ) + + return api_declaration + + def
get_ad_func_input_args(self, inplace_flag=False): + input_args = [] + for name in self.inputs['names']: + # inplace and non-inplace calls forward the same argument name + input_args.append(name.split('@')[0]) + return input_args + + def get_ad_func_args(self, inplace_flag=False): + ad_func_args = self.get_ad_func_input_args(inplace_flag) + # attribute default values are not needed when forwarding a call + for name in self.attrs['names']: + ad_func_args.append(name) + + ad_func_args_str = ", ".join(ad_func_args) + return ad_func_args_str + + def gene_ad_func_call(self): + api_func_name = self.get_api__func_name() + + dygraph_ad_func_name = '::' + api_func_name + '_ad_func' + dygraph_ad_func_parameters = self.get_ad_func_args() + + ad_func_call_str = f""" +VLOG(4) << "Eager Prim API {api_func_name}_ad_func call"; +return {dygraph_ad_func_name}({dygraph_ad_func_parameters}); +""" + # print("ad_func_call_str: ", ad_func_call_str) + return ad_func_call_str + + def gene_eager_prim_api_code(self): + api_code = "" + api_func_name = self.get_api__func_name() + template = '<Tensor>' + # func declaration + if api_func_name[-1] != '_': + api_code = f""" +template <> +{self.get_return_type()} {api_func_name}{template}({self.get_declare_args_nodefault()}) +""" + else: + api_code = f""" +template <> +{self.get_return_type(inplace_flag=True)} {api_func_name}{template}({self.get_declare_args_nodefault(inplace_flag=True)}) +""" + # func code + + api_code = api_code + '{' + api_code += f"""{self.gene_ad_func_call()}""" + api_code += '}' + '\n' + + return api_code diff --git a/paddle/fluid/prim/api/auto_code_generated/prim_gen.py b/paddle/fluid/prim/api/auto_code_generated/prim_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc59df4f33d2de7bdbf76737461f0b848865c36 --- /dev/null +++ b/paddle/fluid/prim/api/auto_code_generated/prim_gen.py @@ -0,0 +1,132 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
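+# Usage sketch (mirrors the invocation in auto_code_generated/CMakeLists.txt;
+# paths abbreviated for illustration):
+#
+#   python prim_gen.py \
+#     --api_yaml_path legacy_ops.parsed.yaml ops.parsed.yaml \
+#     --prim_api_header_path tmp_prim_api.h \
+#     --eager_prim_api_source_path tmp_eager_prim_api.cc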
+ +import argparse + +import yaml +from prim_base import EagerPrimAPI + + +def header_include(): + return """ +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/place.h" +#include "paddle/utils/optional.h" +""" + + +def eager_source_include(header_file_path): + return """ +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" +""" + + +def api_namespace(): + return ( + """ +namespace paddle { +namespace prim { +""", + """ +using Tensor = paddle::experimental::Tensor; +using Scalar = paddle::experimental::Scalar; +using IntArray = paddle::experimental::IntArray; +using DataType = paddle::experimental::DataType; +""", + """ +} // namespace prim +} // namespace paddle +""", + ) + + +def generate_api(api_yaml_path, header_file_path, eager_prim_source_file_path): + apis = [] + + for each_api_yaml in api_yaml_path: + with open(each_api_yaml, 'r') as f: + api_list = yaml.load(f, Loader=yaml.FullLoader) + if api_list: + apis.extend(api_list) + + header_file = open(header_file_path, 'w') + eager_prim_source_file = open(eager_prim_source_file_path, 'w') + + namespace = api_namespace() + + header_file.write("#pragma once\n") + header_file.write(header_include()) + header_file.write(namespace[0]) + header_file.write(namespace[1]) + # eager_source_include() currently hard-codes its include list and + # ignores this argument; keep the directive well-formed regardless. + include_header_file = ( + '#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"' + ) + eager_prim_source_file.write(eager_source_include(include_header_file)) + eager_prim_source_file.write(namespace[0]) + + for api in apis: + prim_api = EagerPrimAPI(api) + if prim_api.is_prim_api: + header_file.write(prim_api.gene_prim_api_declaration()) + eager_prim_source_file.write(prim_api.gene_eager_prim_api_code()) + + header_file.write(namespace[2]) + eager_prim_source_file.write(namespace[2]) + + header_file.close() + eager_prim_source_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle C++ API files' + ) + parser.add_argument( + '--api_yaml_path', + help='path to api yaml file', + nargs='+', + default=['paddle/phi/api/yaml/ops.yaml'], + ) + + parser.add_argument( + '--prim_api_header_path', + help='output of generated prim_api header code file', + default='paddle/fluid/prim/api/generated/prim_api/prim_api.h', + ) + + parser.add_argument( + '--eager_prim_api_source_path', + help='output of generated eager_prim_api source code file', + default='paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc', + ) + + options = parser.parse_args() + + api_yaml_path = options.api_yaml_path + prim_api_header_file_path = options.prim_api_header_path + eager_prim_api_source_file_path = options.eager_prim_api_source_path + + generate_api( + api_yaml_path, + prim_api_header_file_path, + eager_prim_api_source_file_path, + ) + + +if __name__ == '__main__': + main() diff --git a/paddle/fluid/prim/api/generated/CMakeLists.txt b/paddle/fluid/prim/api/generated/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1b75527c20b49d688bde9ea120a74046a411123 --- /dev/null +++ b/paddle/fluid/prim/api/generated/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(prim_api) diff --git a/paddle/fluid/prim/api/manual/prim_api/CMakeLists.txt b/paddle/fluid/prim/api/generated/prim_api/CMakeLists.txt similarity index 100% rename from paddle/fluid/prim/api/manual/prim_api/CMakeLists.txt rename
to paddle/fluid/prim/api/generated/prim_api/CMakeLists.txt diff --git a/paddle/fluid/prim/api/manual/prim_api/static_prim_api.cc b/paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc similarity index 85% rename from paddle/fluid/prim/api/manual/prim_api/static_prim_api.cc rename to paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc index 0bf14b5955ba5c028d40eb38d6387a9a233e592e..fd309750ed6014048421d370501bad0a1fe71eff 100644 --- a/paddle/fluid/prim/api/manual/prim_api/static_prim_api.cc +++ b/paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/utils/utils.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" @@ -37,7 +38,7 @@ namespace paddle { namespace prim { template <> -Tensor pow(const Tensor& x, const paddle::experimental::Scalar& y) { +Tensor pow(const Tensor& x, const Scalar& y) { Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::OpDesc* op = block->AppendOp(); @@ -55,7 +56,7 @@ Tensor pow(const Tensor& x, const paddle::experimental::Scalar& y) { template <> Tensor scale(const Tensor& x, - const paddle::experimental::Scalar& scale, + const Scalar& scale, float bias, bool bias_after_scale) { Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); @@ -95,63 +96,63 @@ Tensor multiply(const Tensor& x, const Tensor& y) { } template <> -Tensor expand(const Tensor& x, const IntArray& shape) { +Tensor unsqueeze(const Tensor& x, const IntArray& axis) { Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::OpDesc* op = block->AppendOp(); - op->SetType("expand_v2"); + op->SetType("unsqueeze2"); op->SetInput("X", {std::static_pointer_cast(x.impl())->Name()}); op->SetOutput( "Out", {std::static_pointer_cast(out.impl())->Name()}); - std::vector new_shape(shape.GetData().begin(), shape.GetData().end()); - op->SetAttr("shape", new_shape); + std::vector new_shape(axis.GetData().begin(), axis.GetData().end()); + op->SetAttr("axes", new_shape); op->CheckAttrs(); op->InferVarType(block); return out; } template <> -Tensor divide(const Tensor& x, const Tensor& y) { - // Grad infershape +Tensor expand(const Tensor& x, const IntArray& shape) { Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::OpDesc* op = block->AppendOp(); - op->SetType("elementwise_div"); + op->SetType("expand_v2"); op->SetInput("X", {std::static_pointer_cast(x.impl())->Name()}); - op->SetInput("Y", - {std::static_pointer_cast(y.impl())->Name()}); op->SetOutput( "Out", {std::static_pointer_cast(out.impl())->Name()}); + std::vector new_shape(shape.GetData().begin(), shape.GetData().end()); + op->SetAttr("shape", new_shape); op->CheckAttrs(); op->InferVarType(block); - op->InferShape(*block); return out; } template <> -Tensor unsqueeze(const Tensor& x, const IntArray& axis) { +Tensor divide(const Tensor& x, const Tensor& y) { + // Grad infershape Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::OpDesc* op = 
block->AppendOp(); - op->SetType("unsqueeze2"); + op->SetType("elementwise_div"); op->SetInput("X", {std::static_pointer_cast(x.impl())->Name()}); + op->SetInput("Y", + {std::static_pointer_cast(y.impl())->Name()}); op->SetOutput( "Out", {std::static_pointer_cast(out.impl())->Name()}); - std::vector new_shape(axis.GetData().begin(), axis.GetData().end()); - op->SetAttr("axes", new_shape); op->CheckAttrs(); op->InferVarType(block); + op->InferShape(*block); return out; } template <> -Tensor full(paddle::experimental::IntArray shape, - paddle::experimental::Scalar value, - paddle::experimental::DataType dtype, - paddle::platform::Place place) { +Tensor full(const IntArray& shape, + const Scalar& value, + DataType dtype, + const Place& place) { // Grad infershape Tensor out = empty({}, dtype, place); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); @@ -159,9 +160,8 @@ Tensor full(paddle::experimental::IntArray shape, op->SetType("fill_constant"); op->SetAttr("shape", shape.GetData()); PADDLE_ENFORCE_EQ( - ((dtype == paddle::experimental::DataType::FLOAT32) || - (dtype == paddle::experimental::DataType::FLOAT64) || - (dtype == paddle::experimental::DataType::FLOAT16)), + ((dtype == DataType::FLOAT32) || (dtype == DataType::FLOAT64) || + (dtype == DataType::FLOAT16)), true, phi::errors::InvalidArgument( "We only support float32/float16 for full, but we got data type: %s", @@ -177,9 +177,9 @@ Tensor full(paddle::experimental::IntArray shape, } template <> -Tensor sum(Tensor x, - paddle::experimental::IntArray axis, - paddle::experimental::DataType dtype, +Tensor sum(const Tensor& x, + const IntArray& axis, + DataType dtype, bool keepdim) { // Grad infershape Tensor out = empty({}, dtype, paddle::Place()); @@ -199,12 +199,12 @@ Tensor sum(Tensor x, "Out", {std::static_pointer_cast(out.impl())->Name()}); op->CheckAttrs(); op->InferVarType(block); - // TODO(jiabin): This may have runtime shape skip infershape for now. + // TODO(jiabin, cxxly): This may have runtime shape skip infershape for now. return out; } template <> -Tensor reshape(Tensor x, paddle::experimental::IntArray shape) { +Tensor reshape(const Tensor& x, const IntArray& shape) { // Grad infershape Tensor out = empty({}, x.dtype(), paddle::Place()); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); @@ -222,7 +222,23 @@ Tensor reshape(Tensor x, paddle::experimental::IntArray shape) { "Out", {std::static_pointer_cast(out.impl())->Name()}); op->CheckAttrs(); op->InferVarType(block); - // TODO(jiabin): This may have runtime shape skip infershape for now. + // TODO(jiabin, cxxly): This may have runtime shape skip infershape for now. 
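+ // NOTE: each static prim api in this file follows the same recipe: create
+ // an empty output DescTensor, append the matching OpDesc to the current
+ // block, wire up the input/output variable names, then run CheckAttrs and
+ // InferVarType (InferShape is invoked only where the shape is static).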
+ return out; +} + +template <> +Tensor exp(const Tensor& x) { + Tensor out = empty({}, phi::DataType::FLOAT32, paddle::Place()); + framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); + framework::OpDesc* op = block->AppendOp(); + op->SetType("exp"); + op->SetInput("X", + {std::static_pointer_cast(x.impl())->Name()}); + op->SetOutput( + "Out", {std::static_pointer_cast(out.impl())->Name()}); + op->CheckAttrs(); + op->InferVarType(block); + op->InferShape(*block); return out; } } // namespace prim diff --git a/paddle/fluid/prim/api/manual/CMakeLists.txt b/paddle/fluid/prim/api/manual/CMakeLists.txt index 261f6dd486302c71ee7dbfbf2b4dd0d66ca81772..512d2b1553c8c94a06445f3c59c4b77d10d74032 100644 --- a/paddle/fluid/prim/api/manual/CMakeLists.txt +++ b/paddle/fluid/prim/api/manual/CMakeLists.txt @@ -1,2 +1 @@ -add_subdirectory(prim_api) add_subdirectory(utils) diff --git a/paddle/fluid/prim/api/manual/backward/composite_backward_api.h b/paddle/fluid/prim/api/manual/backward/composite_backward_api.h index fc276842b815ec8f1e8f36b3d9e4d8e73737ee73..31e09b34f16df11e82541d99795f07d5073aa538 100644 --- a/paddle/fluid/prim/api/manual/backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/manual/backward/composite_backward_api.h @@ -49,7 +49,7 @@ void subtract_grad(const Tensor& x, sum(scale_out_grad, phi::vectorize(reduce_dim), y.dtype(), false); auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); set_output(dy_tmp, dy); - // dy->set_impl(dy_tmp.impl()); + } else { by_pass(scale_out_grad, dy); } @@ -62,7 +62,6 @@ void subtract_grad(const Tensor& x, sum(out_grad, phi::vectorize(reduce_dim), x.dtype(), false); auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); set_output(dx_tmp, dx); - // dx->set_impl(dx_tmp.impl()); } else { by_pass(out_grad, dx); } @@ -84,7 +83,6 @@ void add_grad(const Tensor& x, sum(out_grad, phi::vectorize(reduce_dim), y.dtype(), false); auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); set_output(dy_tmp, dy); - // dy->set_impl(dy_tmp.impl()); } else { by_pass(out_grad, dy); } @@ -97,7 +95,6 @@ void add_grad(const Tensor& x, sum(out_grad, phi::vectorize(reduce_dim), x.dtype(), false); auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); set_output(dx_tmp, dx); - // dx->set_impl(dx_tmp.impl()); } else { by_pass(out_grad, dx); } @@ -139,7 +136,6 @@ void sum_grad(const Tensor& x, x_grad_tmp = expand(out_grad, x_dim); } set_output(x_grad_tmp, x_grad); - // x_grad->set_impl(x_grad_tmp.impl()); } template @@ -163,10 +159,8 @@ void divide_grad(const Tensor& x, sum(dy_res, phi::vectorize(reduce_dim), y.dtype(), false); auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); set_output(dy_tmp, dy); - // dy->set_impl(dy_tmp.impl()); } else { set_output(dy_res, dy); - // dy->set_impl(dy_res.impl()); } } // indicate we will compute dy if (dx) { @@ -181,10 +175,8 @@ void divide_grad(const Tensor& x, sum(dx_res, phi::vectorize(reduce_dim), x.dtype(), false); auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); set_output(dx_tmp, dx); - // dx->set_impl(dx_tmp.impl()); } else { set_output(dx_res, dx); - // dx->set_impl(dx_res.impl()); } } // indicate we will compute dx } @@ -196,7 +188,6 @@ void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { auto tmp = divide(div_x, out); auto x_grad_tmp = multiply(out_grad, tmp); set_output(x_grad_tmp, x_grad); - // x_grad->set_impl(x_grad_tmp.impl()); } } } // namespace prim diff --git 
a/paddle/fluid/prim/api/manual/prim_api/eager_prim_api.cc b/paddle/fluid/prim/api/manual/prim_api/eager_prim_api.cc deleted file mode 100644 index 7dac02ea5b203e45adf5166602d6b41d3752194f..0000000000000000000000000000000000000000 --- a/paddle/fluid/prim/api/manual/prim_api/eager_prim_api.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/eager/api/all.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" -#include "paddle/phi/capi/include/wrapper_base.h" -namespace paddle { -namespace prim { -template <> -Tensor pow(const Tensor& x, const paddle::experimental::Scalar& y) { - return ::pow_ad_func(x, y); -} - -template <> -Tensor scale(const Tensor& x, - const paddle::experimental::Scalar& scale, - float bias, - bool bias_after_scale) { - return ::scale_ad_func(x, scale, bias, bias_after_scale); -} - -template <> -Tensor multiply(const Tensor& x, const Tensor& y) { - return ::multiply_ad_func(x, y); -} - -template <> -Tensor expand(const Tensor& x, const IntArray& shape) { - return ::expand_ad_func(x, shape); -} - -template <> -Tensor unsqueeze(const Tensor& x, const IntArray& axis) { - return ::unsqueeze_ad_func(x, axis); -} - -template <> -Tensor divide(const Tensor& x, const Tensor& y) { - return ::divide_ad_func(x, y); -} - -template <> -Tensor full(paddle::experimental::IntArray shape, - paddle::experimental::Scalar value, - paddle::experimental::DataType dtype, - paddle::platform::Place place) { - return ::full_ad_func(shape, value, dtype, place); -} -template <> -Tensor sum(Tensor x, IntArray axis, DataType dtype, bool keepdim) { - return ::sum_ad_func(x, axis, dtype, keepdim); -} - -template <> -Tensor reshape(Tensor x, IntArray shape) { - return ::reshape_ad_func(x, shape); -} -} // namespace prim -} // namespace paddle diff --git a/paddle/fluid/prim/api/manual/prim_api/prim_api.h b/paddle/fluid/prim/api/manual/prim_api/prim_api.h index 5465cdb601e9557be56ddd8efd5640ae95abbc19..65d411d86307ded238a4bc07e6336659663ca406 100644 --- a/paddle/fluid/prim/api/manual/prim_api/prim_api.h +++ b/paddle/fluid/prim/api/manual/prim_api/prim_api.h @@ -12,50 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+// prim api which can't be generated #pragma once + +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" #include "paddle/utils/optional.h" -namespace paddle { -namespace prim { -using Tensor = paddle::experimental::Tensor; -using IntArray = paddle::experimental::IntArray; -using Scalar = paddle::experimental::Scalar; - -template -Tensor pow(const Tensor& x, const Scalar& y); - -template -Tensor scale(const Tensor& X, - const Scalar& scale, - float bias, - bool bias_after_scale); - -template -Tensor multiply(const Tensor& x, const Tensor& y); - -template -Tensor expand(const Tensor& x, const IntArray& shape); -template -Tensor unsqueeze(const Tensor& x, const IntArray& axis); - -template -Tensor divide(const Tensor& x, const Tensor& y); - -template -Tensor full(IntArray shape, - Scalar value, - DataType dtype = DataType::FLOAT32, - Place place = CPUPlace()); - -template -Tensor sum(Tensor x, - IntArray axis = {}, - DataType dtype = DataType::UNDEFINED, - bool keepdim = false); - -template -Tensor reshape(Tensor x, IntArray shape); -} // namespace prim +namespace paddle { +namespace prim {} // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/api/manual/utils/utils.h b/paddle/fluid/prim/api/manual/utils/utils.h index 69d879e37b0b2dd3271967cb91acb322f354af1a..20b02f2df9c79235d645f13a2a3cce8f8ff08d67 100644 --- a/paddle/fluid/prim/api/manual/utils/utils.h +++ b/paddle/fluid/prim/api/manual/utils/utils.h @@ -16,11 +16,12 @@ #include #include #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/ddim.h" -using IntArray = paddle::experimental::IntArray; + namespace paddle { namespace prim { // We put some api like utils here @@ -42,42 +43,40 @@ void set_output(const paddle::experimental::Tensor& x_tmp, paddle::experimental::Tensor* x); // These method don't need to be specified -static phi::DDim get_reduce_dims(const phi::DDim& x_dims, - const phi::DDim& y_dims) { +static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims, + const phi::DDim& in_dims) { std::vector result; - PADDLE_ENFORCE_GE(phi::product(x_dims), - phi::product(y_dims), - phi::errors::InvalidArgument( - "Only x_dims >= y_dims is accepted for " - "get_reduce_dims, but we got x_dims: %s, y_dims: %s", - x_dims, - y_dims)); - int bat = x_dims.size() - y_dims.size(); + int bat = dout_dims.size() - in_dims.size(); for (int i = 0; i < bat; ++i) { result.push_back(i); } - for (int i = 0; i < y_dims.size(); ++i) { - if (y_dims[i] == 1) { + for (int i = 0; i < in_dims.size(); ++i) { + if (in_dims[i] == 1) { result.push_back(i + bat); } else { PADDLE_ENFORCE_EQ( - y_dims[i], - x_dims[i + bat], + in_dims[i], + dout_dims[i + bat], platform::errors::InvalidArgument( "ReduceDims dimension mismatch. Operands could " - "not be broadcast together with the shape of x_dims = [%s] and " - "the shape of y_dims = [%s]. Received [%d] in X is not equal to " + "not be broadcast together with the shape of dout = [%s] and " + "the shape of in_dims = [%s]. 
Received [%d] in X is not equal to " "[%d] in Y at i:%d.", - x_dims, - y_dims, - x_dims[i + bat], - y_dims[i], + dout_dims, + in_dims, + dout_dims[i + bat], + in_dims[i], i)); } } - auto res_dims = phi::make_ddim(result); - VLOG(4) << "Reduce Dims is: " << res_dims; - return res_dims; + return phi::make_ddim(result); +} + +static phi::DDim get_reduce_dims(const phi::DDim& x_dims, + const phi::DDim& y_dims) { + auto out_dims = paddle::operators::details::BroadcastTwoDims(x_dims, y_dims); + return get_reduce_dims_from_out(out_dims, x_dims); } + } // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index e053d1465e70d53a40920b73b9e7a959eef1b8dc..c2e7ca4ec57e2b11f6ce76a549408bcabdfbd1be 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -477,6 +477,9 @@ class GradCompositeOpMakerBase { void RecoverOutputName(const paddle::experimental::Tensor& output, const std::string& origin_name) { if (origin_name == framework::kEmptyVarName) return; + VLOG(4) << "Recover: " + << static_cast(output.impl().get())->Name() + << " To: " << origin_name; prim::StaticCompositeContext::Instance().GetBlock()->RenameVar( static_cast(output.impl().get())->Name(), origin_name); diff --git a/paddle/fluid/pybind/eager_legacy_custom_python_api.h b/paddle/fluid/pybind/eager_legacy_custom_python_api.h index c599346bdb7a89d2f0032cf0e4ecfe3da998026b..1deb20fbf9b88bf283c5f1af5ddecd95473fb541 100644 --- a/paddle/fluid/pybind/eager_legacy_custom_python_api.h +++ b/paddle/fluid/pybind/eager_legacy_custom_python_api.h @@ -26,9 +26,9 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *kwargs) { PyThreadState *tstate = nullptr; try { - auto X = GetTensorListFromArgs("run_program", "X", args, 0, false); + auto X = GetTensorListFromArgs("run_program", "X", args, 0, true); auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true); - auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false); + auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, true); auto OutScope = GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 4b9f2c295e9528ca4a9d9b5b6134668d7cb7ce2f..34132e199e7326059dddd3970ab571ccc52dd22d 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1503,7 +1503,7 @@ static PyObject* tensor_method_set_string_list(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - using Strings = std::vector; + using Strings = paddle::framework::Strings; auto strings = CastPyArg2VectorOfString(PyTuple_GET_ITEM(args, 0), 0); auto var_tensor = std::make_shared(); *var_tensor->GetMutable() = strings; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index a86271dfbf532eb5d1a7d8e2ea932586e4cc3608..c328b1258cdeb7b49b486185fbbf5fb1c54e7163 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -184,39 +184,41 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { value[i] = ddim[i]; } } - - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - auto default_layout = - 
paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); - bool change_dim = - (desired_layout != default_layout && - self->tensor.layout() == desired_layout && value.size() == 4); - VLOG(6) << "eager_properties 'Shape' method, layout autotune " - << " desired_layout: " << desired_layout - << " default_layout: " << default_layout - << " tensor layout: " << self->tensor.layout() - << " tensor's shape size is : " << value.size(); - std::vector dims = value; - if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") { - // NCHW -> NHWC - VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] << " " - << value[1] << " " << value[2] << " " << value[3] << " to " - << dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1]; - value[0] = dims[0]; - value[1] = dims[2]; - value[2] = dims[3]; - value[3] = dims[1]; - } else if (change_dim && phi::DataLayoutToString(desired_layout) == "NHWC") { - // NHWC -> NCHW - VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] << " " - << value[1] << " " << value[2] << " " << value[3] << " to " - << dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2] - << " " << dims[1]; - value[0] = dims[0]; - value[1] = dims[3]; - value[2] = dims[1]; - value[3] = dims[2]; + if (!egr::IsVariableCompatTensor(self->tensor)) { + auto desired_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + bool change_dim = + (desired_layout != default_layout && + self->tensor.layout() == desired_layout && value.size() == 4); + VLOG(6) << "eager_properties 'Shape' method, layout autotune " + << " desired_layout: " << desired_layout + << " default_layout: " << default_layout + << " tensor layout: " << self->tensor.layout() + << " tensor's shape size is : " << value.size(); + std::vector dims = value; + if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") { + // NCHW -> NHWC + VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] + << " " << value[1] << " " << value[2] << " " << value[3] << " to " + << dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1]; + value[0] = dims[0]; + value[1] = dims[2]; + value[2] = dims[3]; + value[3] = dims[1]; + } else if (change_dim && + phi::DataLayoutToString(desired_layout) == "NHWC") { + // NHWC -> NCHW + VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] + << " " << value[1] << " " << value[2] << " " << value[3] << " to " + << dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2] + << " " << dims[1]; + value[0] = dims[0]; + value[1] = dims[3]; + value[2] = dims[1]; + value[3] = dims[2]; + } } return ToPyObject(value); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 9262fec62b989d3643904cc33658a68f795c5f74..e01044720571d088899a8e60b02b6bd8bb5304c1 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -624,6 +624,11 @@ void BindImperative(py::module *m_ptr) { m.def("_cleanup_mmap_fds", []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); }); + + m.def("_set_max_memory_map_allocation_pool_size", [](int32_t size) { + memory::allocation::MemoryMapAllocationPool::Instance().SetMaxPoolSize( + size); + }); #endif m.def("start_imperative_gperf_profiler", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c7bc5722e3804d49bc7d89210b445f456f4fc4ba..43ee2d479b0b76b0d6851fe2c1b58e06e977fb76 100644 --- 
a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -970,7 +970,7 @@ All parameter, weight, gradient are variables in Paddle. } }) .def("set_string_list", - [](Variable &self, Strings str_list) { + [](Variable &self, std::vector str_list) { *self.GetMutable() = str_list; }) .def("set_vocab", @@ -1926,7 +1926,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", static_cast &, const std::string &, size_t)>(&framework::SetFeedVariable)); m.def("get_fetch_variable", diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 8739b32965b0dea4e163091f6cbed321ba9590d4..4bdde24f431bc830171ea4d3c02d1064da67926b 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -182,6 +182,7 @@ limitations under the License. */ #include "pybind11/stl.h" DECLARE_bool(use_mkldnn); +DECLARE_bool(use_shm_cache); // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); @@ -910,9 +911,16 @@ void BindTensor(pybind11::module &m) { // NOLINT int flags = memory::allocation::MAPPED_SHAREDMEM | memory::allocation::MAPPED_EXCLUSIVE; std::string handle = memory::allocation::GetIPCName(); + int find_id = -1; + if (FLAGS_use_shm_cache) { + find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, data_size); // NOLINT + } + if (find_id != -1) { + handle = memory::allocation::MemoryMapAllocationPool::Instance().GetById(find_id).file_name_; // NOLINT + } auto shared_holder = memory::allocation::AllocateRefcountedMemoryMapAllocation( - handle, flags, data_size); + handle, flags, data_size, find_id); // copy data & reset holder if (platform::is_cuda_pinned_place(holder->place())) { @@ -961,10 +969,13 @@ void BindTensor(pybind11::module &m) { // NOLINT size_t size = t[1].cast(); int flags = memory::allocation::MAPPED_SHAREDMEM | memory::allocation::MAPPED_NOCREATE; - + int find_id = -1; + if (FLAGS_use_shm_cache) { + find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, size, ipc_name, /*check_refcount*/ false); // NOLINT + } auto shared_holder = memory::allocation::AllocateRefcountedMemoryMapAllocation( - ipc_name, flags, size); + ipc_name, flags, size, find_id); // 3. Rebuild Tensor tensor.ResetHolderWithType( diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 074da80bbfb6e9d09eb2f5c87094be103bd68c27..73569b387312c7278fd9840dfb362cf752012c0a 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -22,6 +22,9 @@ limitations under the License. 
*/ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/string_tensor_utils.h" #include "paddle/phi/core/tensor_utils.h" +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/backends/device_manager.h" +#endif namespace paddle { namespace experimental { @@ -54,6 +57,11 @@ bool HasAllocation(const phi::TensorBase& t) { BackendSet GetTensorBackendSet(const phi::TensorBase& t) { if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (t.place().GetType() == AllocationType::CUSTOM) { + phi::DeviceManager::SetDevice(t.place()); + } +#endif phi::Backend backend_key = phi::TransToPhiBackend(t.place()); BackendSet backend_set(backend_key); if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) && diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 277ef04c6888b91a83690b4f2cc2b2e9d390d2b9..23158d794019f6313762e1fcebb9aa09af204104 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -431,6 +431,7 @@ kernel : func : exp_grad inplace : (out_grad -> x_grad) + composite : exp_grad(out, out_grad, x_grad) - backward_op : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) @@ -1085,6 +1086,30 @@ func : selu_grad data_type : out +- backward_op : send_u_recv_grad + forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM") + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param : [x] + kernel : + func : send_u_recv_grad + data_type : out_grad + optional: out, dst_count + +- backward_op : send_ue_recv_grad + forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0}) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : send_ue_recv_grad + data_type : out_grad + optional: out, dst_count + - backward_op : send_uv_grad forward : send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out) args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD") @@ -1191,6 +1216,17 @@ func : sinh_grad inplace : (out_grad -> x_grad) +- backward_op : slogdet_grad + forward : slogdet (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param : [x] + kernel : + func : slogdet_grad + data_type : out_grad + - backward_op : softplus_grad forward : softplus (Tensor x, float beta, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float beta, float threshold) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 3ef309656735de646ffdda08ddc0d9489bc20ab4..f47e206c7ce2fe2742529382ef18092f92571cde 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -316,9 +316,14 @@ - backward_op : cumsum_grad forward : cumsum(Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out) - args : 
(Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse) + args : (Tensor x, Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse) output : Tensor(x_grad) - invoke : cumsum(out_grad, axis, flatten, exclusive, !reverse) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : cumsum_grad + data_type: x - backward_op : deformable_conv_grad forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out) @@ -475,6 +480,7 @@ func : expand_grad no_need_buffer : x backward : expand_double_grad + composite: expand_grad(x, out_grad, shape, x_grad_p) - backward_op : exponential__grad forward : exponential_ (Tensor x, float lam) -> Tensor(out) @@ -880,6 +886,7 @@ param : [x, y] kernel : func : multiply_grad + composite: multiply_grad(x, y, out_grad, axis, x_grad, y_grad) backward : multiply_double_grad - backward_op : multiply_triple_grad @@ -1201,30 +1208,6 @@ data_type : x optional : summed_ids -- backward_op : send_u_recv_grad - forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM") - output : Tensor(x_grad) - infer_meta : - func : GeneralUnaryGradInferMeta - param : [x] - kernel : - func : send_u_recv_grad - data_type : out_grad - optional: out, dst_count - -- backward_op : send_ue_recv_grad - forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op) - output : Tensor(x_grad), Tensor(y_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, y] - kernel : - func : send_ue_recv_grad - data_type : out_grad - optional: out, dst_count - - backward_op : sigmoid_cross_entropy_with_logits_grad forward : sigmoid_cross_entropy_with_logits (Tensor x, Tensor label, bool normalize, int ignore_index) -> Tensor(out) args : (Tensor x, Tensor label, Tensor out_grad, bool normalize, int ignore_index) @@ -1260,16 +1243,6 @@ backward : slice_double_grad no_need_buffer : input -- backward_op : slogdet_grad - forward : slogdet (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : slogdet_grad - - backward_op : softmax_grad forward : softmax (Tensor x, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 52db798aecc8f97026417d21ada3ca9345415ec6..049d86473cfc5b5c2c4aea411d783914ced2d9c5 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1298,16 +1298,6 @@ kernel : func : not_equal -- op : numel - args : (Tensor x) - output : Tensor(size) - infer_meta : - func : NumelInferMeta - kernel : - func : numel - data_transform: - skip_transform : x - - op : one_hot args : (Tensor x, Scalar(int) num_classes) output : Tensor(out) @@ -1588,28 +1578,6 @@ data_type : x backward : segment_pool_grad -- op : send_u_recv - args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", 
IntArray out_size = {0}) - output : Tensor(out), Tensor(dst_count) - infer_meta : - func : SendURecvInferMeta - kernel : - func : send_u_recv - data_type : x - intermediate : dst_count - backward : send_u_recv_grad - -- op : send_ue_recv - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) - output : Tensor(out), Tensor(dst_count) - infer_meta : - func : SendUERecvInferMeta - kernel : - func : send_ue_recv - data_type : x - intermediate : dst_count - backward : send_ue_recv_grad - - op : sgd_ args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision) output : Tensor(param_out), Tensor(master_param_out) @@ -1663,15 +1631,6 @@ func : slice backward : slice_grad -- op : slogdet - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : slogdet - backward : slogdet_grad - - op : softmax args : (Tensor x, int axis) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index c2b22ba7af5a341f7e436facedc63e62d925de00..1fc4144849b2ac048488f9cfb284ef3970d6a774 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -993,6 +993,12 @@ outputs : {out : Out, total_weight : Total_weight} +- op : numel(size) + inputs : + x : Input + outputs : + size : Out + - op : overlap_add backward : overlap_add_grad inputs : @@ -1215,6 +1221,28 @@ outputs : out : Out +- op : send_u_recv(graph_send_recv) + backward : send_u_recv_grad(graph_send_recv_grad) + inputs : + {x : X, src_index : Src_index, dst_index : Dst_index} + outputs : + {out : Out, dst_count : Dst_count} + int_array : + out_size: + data_type : int64_t + tensor_name : Out_size + +- op : send_ue_recv(graph_send_ue_recv) + backward : send_ue_recv_grad(graph_send_ue_recv_grad) + inputs : + {x : X, y : Y, src_index : Src_index, dst_index : Dst_index} + outputs : + {out : Out, dst_count : Dst_count} + int_array : + out_size: + data_type : int64_t + tensor_name : Out_size + - op : send_uv (graph_send_uv) backward : send_uv_grad (graph_send_uv_grad) @@ -1286,6 +1314,13 @@ extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] +- op : slogdet(slogdeterminant) + backward : slogdet_grad(slogdeterminant_grad) + inputs : + x : Input + outputs : + out : Out + - op : softmax backward : softmax_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 8ea815a28d46550180f2bf46518d7b787980cc37..df606ebec0734ea1220bb53ba501624bba94aba0 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -871,6 +871,18 @@ kernel : func : npu_identity +- op : numel + args : (Tensor x) + output : Tensor(size) + infer_meta : + func : NumelInferMeta + kernel : + func : numel + data_type : x + data_transform: + skip_transform : x + no_need_buffer : x + - op : overlap_add args: (Tensor x, int hop_length, int axis=-1) output: Tensor @@ -1029,6 +1041,28 @@ func : selu backward : selu_grad +- op : send_u_recv + args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) + output : Tensor(out), Tensor(dst_count) + infer_meta : + func : SendURecvInferMeta + kernel : + func : send_u_recv + data_type : x + intermediate : dst_count + backward : send_u_recv_grad + +- op : send_ue_recv + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0}) + output : Tensor(out), 
Tensor(dst_count) + infer_meta : + func : SendUERecvInferMeta + kernel : + func : send_ue_recv + data_type : x + intermediate : dst_count + backward : send_ue_recv_grad + - op : send_uv args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") output : Tensor(out) @@ -1083,6 +1117,15 @@ func : sinh backward : sinh_grad +- op : slogdet + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : slogdet + backward : slogdet_grad + - op : softplus args : (Tensor x, float beta = 1.0, float threshold = 20.0f) output : Tensor diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index f7888be82b309a2da760661d7020f715fe7e4d63..367231972acbcf6a504ab8dc36b20e5763cf9b0b 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -360,6 +360,7 @@ XPUOpMap& get_kl2_ops() { {"log_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"log_softmax", XPUKernelSet({phi::DataType::FLOAT32})}, {"log_softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"logical_not", XPUKernelSet({phi::DataType::BOOL})}, {"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"lookup_table_v2", XPUKernelSet({phi::DataType::FLOAT32})}, {"masked_select", diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 4065306abc7989990626b6265025aaf4fe2d2763..7b864198129f9a71feac9b03b3bb083c5e3422ef 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -61,11 +61,13 @@ struct XPUContext::Impl { ~Impl() { if (owned_ && context_ != nullptr) { backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); - // manually destroy XPUStream here until xpu::api integrates this work - // into Context dtor xpu_wait(context_->xpu_stream); - xpu_stream_destroy(context_->xpu_stream); - context_->xpu_stream = nullptr; + if (context_->xpu_stream) { + // manually destroy XPUStream here until xpu::api integrates this work + // into Context dtor + xpu_stream_destroy(context_->xpu_stream); + context_->xpu_stream = nullptr; + } xpu::destroy_context(context_); context_ = nullptr; } @@ -73,11 +75,7 @@ struct XPUContext::Impl { const Place& GetPlace() const { return place_; } - XPUStream stream() const { - auto s = context_->xpu_stream; - PD_CHECK(s != nullptr, "the xpu stream is nullptr."); - return s; - } + XPUStream stream() const { return context_->xpu_stream; } xpu::Context* GetXContext() const { PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); @@ -103,13 +101,20 @@ struct XPUContext::Impl { context_ = xpu::create_context(); xpu_version_ = backends::xpu::get_xpu_version(place_.device); SetL3Cache(); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream)); } void SetXContext(xpu::Context* context) { context_ = context; } void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } + void CreateStream() { + if (context_->xpu_stream) { + VLOG(3) << "xpu stream is already created for current context"; + return; + } + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream)); + } + bool owned_{false}; Place place_; backends::xpu::XPUVersion xpu_version_; @@ -153,6 +158,8 @@ void XPUContext::SetBkclContext(xpu::BKCLContext_t context) { impl_->SetBkclContext(context); } +void XPUContext::CreateStream() { impl_->CreateStream(); } + void XPUContext::Init() { impl_->Init(); } } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_context.h 
b/paddle/phi/backends/xpu/xpu_context.h index 1c12c7e5fe69a490b19e21ed8a1646422593a433..731a3e16c42621b673c7cef8da3166209cc60150 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -46,6 +46,7 @@ class XPUContext : public DeviceContext, // Return bkcl context. xpu::BKCLContext_t bkcl_context() const; void SetBkclContext(xpu::BKCLContext_t context); + void CreateStream(); // Wait for all operations completion in the stream. void Wait() const override; diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 2ddafdac520eb72b6dee5a0ebb161b78b51b0813..cd9c24436da97eadd772100e99f71da441317d9e 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -134,7 +134,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); + std::string device_type = + phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); if (!device_type.empty()) { os << device_type; } else { @@ -178,7 +180,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::IPU; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + - phi::GetOrRegisterGlobalDeviceTypeId(s)); + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(s)); } } @@ -207,7 +210,9 @@ inline std::string BackendToString(const Backend& backend) { default: size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); + std::string device_type = + phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); if (!device_type.empty()) { return device_type; } else { diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index d2719f4a0732a4b08b4352bf15bab913703e8a0e..30346d8727f64e3ed0ca8fed10b1b5c194d144c0 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -16,7 +16,6 @@ limitations under the License. 
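Taken together, the backend.h hunks above and the place.cc/place.h hunks that follow replace two file-static maps and their free functions (GetOrRegisterGlobalDeviceTypeId, GetGlobalDeviceType) with a CustomRegisteredDeviceMap singleton, so the custom-device registry has a single owner across translation units. A minimal Python sketch of the same registry pattern (illustrative only; the names below are not Paddle API):

class DeviceTypeRegistry:
    _instance = None

    @classmethod
    def instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self._type_to_id = {}
        self._id_to_type = {}

    def get_or_register(self, device_type):
        # id 0 is reserved for "no custom device type"
        if not device_type:
            return 0
        if device_type not in self._type_to_id:
            type_id = len(self._type_to_id) + 1
            self._type_to_id[device_type] = type_id
            self._id_to_type[type_id] = device_type
        return self._type_to_id[device_type]

    def device_type(self, type_id):
        return self._id_to_type.get(type_id, "")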
*/ #include #include -#include #include "glog/logging.h" #include "paddle/phi/api/ext/exception.h" @@ -54,7 +53,8 @@ std::string Place::DebugString() const { std::ostringstream os; os << "Place("; if (alloc_type_ == AllocationType::CUSTOM) { - os << GetGlobalDeviceType(device_type_id_); + os << phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); } else { os << AllocationTypeStr(alloc_type_); } @@ -85,25 +85,29 @@ Place GetPinnedPlace(const Place &place) { } } -static std::unordered_map global_registered_device_type_id; -static std::unordered_map global_registered_device_type; +CustomRegisteredDeviceMap &CustomRegisteredDeviceMap::Instance() { + static CustomRegisteredDeviceMap g_custom_registered_device_map; + return g_custom_registered_device_map; +} -size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) { +size_t CustomRegisteredDeviceMap::GetOrRegisterGlobalDeviceTypeId( + const std::string &device_type) { if (device_type.empty()) return 0; - if (global_registered_device_type_id.find(device_type) == - global_registered_device_type_id.end()) { - size_t device_type_id = global_registered_device_type_id.size() + 1; - global_registered_device_type_id[device_type] = device_type_id; - global_registered_device_type[device_type_id] = device_type; + if (registered_device_type_id_.find(device_type) == + registered_device_type_id_.end()) { + size_t device_type_id = registered_device_type_id_.size() + 1; + registered_device_type_id_[device_type] = device_type_id; + registered_device_type_[device_type_id] = device_type; } - return global_registered_device_type_id[device_type]; + return registered_device_type_id_[device_type]; } -std::string GetGlobalDeviceType(size_t device_type_id) { - if (global_registered_device_type.find(device_type_id) == - global_registered_device_type.end()) +std::string CustomRegisteredDeviceMap::GetGlobalDeviceType( + size_t device_type_id) { + if (registered_device_type_.find(device_type_id) == + registered_device_type_.end()) return ""; - return global_registered_device_type[device_type_id]; + return registered_device_type_[device_type_id]; } constexpr static int kAllocationTypeBitLength = 8; @@ -143,7 +147,9 @@ static int8_t GetCorrectDeviceIdByPlaceType( Place::Place(paddle::PlaceType type) : device(detail::GetCorrectDeviceIdByPlaceType(type)), alloc_type_(static_cast(type)), - device_type_id_(GetOrRegisterGlobalDeviceTypeId("")) { + device_type_id_( + CustomRegisteredDeviceMap::Instance().GetOrRegisterGlobalDeviceTypeId( + "")) { LOG_FIRST_N(WARNING, 1) << "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version " "2.3, and will be removed in version 2.4! Please use " diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 12aa6f90ed91636ca7e554bf13199cc74334ede3..a21a53776b6e11901e7a1c417892a6f5e8317d6f 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include "paddle/phi/api/include/dll_decl.h" @@ -37,11 +38,21 @@ enum class AllocationType : int8_t { CUSTOM = 9, }; -const char* AllocationTypeStr(AllocationType type); +class CustomRegisteredDeviceMap { + public: + static CustomRegisteredDeviceMap& Instance(); -size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); + size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); -std::string GetGlobalDeviceType(size_t device_type_id_); + std::string GetGlobalDeviceType(size_t device_type_id_); + + private: + CustomRegisteredDeviceMap() = default; + std::unordered_map registered_device_type_id_; + std::unordered_map registered_device_type_; +}; + +const char* AllocationTypeStr(AllocationType type); /// \brief The place is used to specify where the data is stored. class PADDLE_API Place { @@ -53,12 +64,14 @@ class PADDLE_API Place { const std::string& dev_type = "") : device(id), alloc_type_(type), - device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + device_type_id_(phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(dev_type)) {} explicit Place(AllocationType type, const std::string& dev_type = "") : device(0), alloc_type_(type), - device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + device_type_id_(phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(dev_type)) {} // See NOTE [ Why need to temporarily adapt to PlaceType? ] Place(paddle::PlaceType type); // NOLINT @@ -69,7 +82,8 @@ class PADDLE_API Place { alloc_type_ = type; device = device_id; if (!dev_type.empty()) { - device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type); + device_type_id_ = phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(dev_type); } } @@ -78,7 +92,8 @@ class PADDLE_API Place { int8_t GetDeviceId() const { return device; } std::string GetDeviceType() const { - return GetGlobalDeviceType(device_type_id_); + return phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); } std::string DebugString() const; diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 4e3447fe9eb22c498b4f33524ebff5e56a7e1e79..2dd3ba8b767027009877542bc15fac35ea72075c 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -110,6 +110,7 @@ class ArgumentMappingContext { virtual bool IsSelectedRowsInput(const std::string& name) const = 0; virtual bool IsSelectedRowsInputs(const std::string& name) const = 0; virtual bool IsSparseCooTensorInput(const std::string& name) const = 0; + virtual bool IsSparseCooTensorOutput(const std::string& name) const = 0; virtual bool IsSparseCsrTensorInput(const std::string& name) const = 0; // For compatibility with LoDTensorArray virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 03fb3f3e6b6b668a719a3c0af4976c26a826c2d2..149c62f11677e425c09e4de18836e86f95670c2c 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -46,7 +46,8 @@ Backend TransToPhiBackend(const phi::Place& place) { case AllocationType::CUSTOM: return static_cast( static_cast(Backend::NUM_BACKENDS) + - GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType())); + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType())); default: 
PADDLE_THROW(phi::errors::InvalidArgument( "Unsupported transform %s to phi Backend.", place)); @@ -91,7 +92,9 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { #ifdef PADDLE_WITH_CUSTOM_DEVICE size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); + std::string device_type = + phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( + device_type_id_); if (!device_type.empty()) { return phi::CustomPlace( device_type, diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 5b8dc47d6498177124bdc53be2d27f97546fe086..8b5a78575d220d7d04c7f1046237d79053f8cb30 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1010,6 +1010,18 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, #endif +/* + * CUDA Graph related FLAG + * Name: FLAGS_new_executor_use_cuda_graph + * Since Version: 2.4 + * Value Range: bool, default=false + * Example: FLAGS_new_executor_use_cuda_graph=true would allow + * new executor to use CUDA Graph. + */ +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, + false, + "Use CUDA Graph in new executor"); + DEFINE_int32(record_pool_max_size, 2000000, "SlotRecordDataset slot record pool max size"); @@ -1181,3 +1193,16 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, ""); PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache, false, "Add a persistent ibuilder."); + +/** + * mmap_allocator related FLAG + * Name: use_shm_cache + * Since Version: 2.5.0 + * Value Range: bool, default=true + * Example: + * Note: . If True, mmap_allocator will cache shm file to decrease munmap + * operation. + */ +PADDLE_DEFINE_EXPORTED_bool(use_shm_cache, + true, + "Use shm cache in mmap_allocator."); diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index a195f2ad60cbcb5fddc1e712e72c7ce263d75785..dc6f657fee67690b4b406facd1f9257a8b21d306 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -101,6 +101,12 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == + std::type_index(typeid(const phi::ExtendedTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid( const std::vector&))) { args_def->AppendInput(default_key.backend(), diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 1eb3a52aebad11c571e645451415fbeca3edac4f..eb18d0cb98c5b8cc03af3e3804a0af9466d231b8 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -265,6 +265,7 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(ExtendedTensor); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(ExtendedTensor); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(TensorBase); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SelectedRows); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 002af75c04c1facb11f83d9c2f29374af7d97c41..561938adca80a22cc3700baab3dc58c8bf9a6321 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -381,7 +381,11 @@ void CompareAllInferMeta(const MetaTensor& x, errors::InvalidArgument( "The size of dim_y 
should not be greater than dim_x's.")); out->share_lod(x); - out->set_dims(make_ddim({1})); + if (!x.dims().size() || !y.dims().size()) { + out->set_dims(make_ddim({})); + } else { + out->set_dims(make_ddim({1})); + } out->set_dtype(DataType::BOOL); } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 04373fa29edf9e58661d8c52cf85d08be0ceaf13..5a7b2cf16a1f8cdc896da44193befe43674b23cd 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -424,10 +424,36 @@ void CumInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim({phi::product(x_dims)})); out->set_dtype(x.dtype()); } else { + if (x_dims.size() > 0) { + PADDLE_ENFORCE_GE( + axis, + -x_dims.size(), + phi::errors::OutOfRange( + "axis is out of range (expected to be in range of [%ld, " + "%ld), but got %ld).", + -(x_dims.size()), + x_dims.size(), + axis)); + PADDLE_ENFORCE_LT( + axis, + x_dims.size(), + phi::errors::OutOfRange( + "axis is out of range (expected to be in range of [%ld, " + "%ld), but got %ld).", + -(x_dims.size()), + x_dims.size(), + axis)); + } else { + PADDLE_ENFORCE_EQ( + (axis == 0 || axis == -1), + true, + errors::InvalidArgument("The axis must be -1 or 0 in 0D Tensor, " + "but the value given is %d.", + axis)); + } out->set_dims(x_dims); out->set_dtype(x.dtype()); } - out->share_lod(x); } diff --git a/paddle/phi/kernels/cpu/cum_grad_kernel.cc b/paddle/phi/kernels/cpu/cum_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..32be44661348fbf8e3e4e6713637af88c89fe560 --- /dev/null +++ b/paddle/phi/kernels/cpu/cum_grad_kernel.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
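Two user-visible consequences of the infer-meta changes above, sketched with the public ops they back (paddle.equal_all sits on CompareAllInferMeta, paddle.cumsum on CumInferMeta; exact error text may differ by build):

import paddle

# CompareAllInferMeta: a 0-D operand now yields a 0-D bool result
a = paddle.to_tensor(1.0)              # 0-D tensor, shape []
b = paddle.to_tensor(1.0)
print(paddle.equal_all(a, b).shape)    # [] after this change (previously [1])

# CumInferMeta: a 0-D input only accepts axis 0 or -1
print(paddle.cumsum(a, axis=0).shape)  # []
# paddle.cumsum(a, axis=1) now raises the InvalidArgument error added above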
+ +#include "paddle/phi/kernels/cum_grad_kernel.h" +#include "paddle/phi/kernels/cum_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +namespace phi { + +template +void CumsumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* x_grad) { + x_grad->Resize(x.dims()); + CumsumKernel( + dev_ctx, out_grad, axis, flatten, exclusive, !reverse, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumsum_grad, + CPU, + ALL_LAYOUT, + phi::CumsumGradKernel, + float, + double, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/cum_kernel.cc b/paddle/phi/kernels/cpu/cum_kernel.cc index 2b6a9be371afb6f2c64355e241c2611571bca395..f7ec5bbbf9e844fc51538641e180018c32b9e438 100644 --- a/paddle/phi/kernels/cpu/cum_kernel.cc +++ b/paddle/phi/kernels/cpu/cum_kernel.cc @@ -57,6 +57,14 @@ void ScanKernel(const Context& dev_ctx, bool reverse, Reducer reducer, DenseTensor* out) { + dev_ctx.template Alloc(out); + + if (x.numel() == 1) { + auto raw_dims = out->dims(); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(raw_dims); + return; + } auto out_dims = out->dims(); PADDLE_ENFORCE_EQ( @@ -72,8 +80,6 @@ void ScanKernel(const Context& dev_ctx, axis += out_dims.size(); } - dev_ctx.template Alloc(out); - int pre = 1; int post = 1; int mid = out_dims[axis]; diff --git a/paddle/phi/kernels/cum_grad_kernel.h b/paddle/phi/kernels/cum_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f2428524fe5a2fa87bbe20ce417538e708fa8ab4 --- /dev/null +++ b/paddle/phi/kernels/cum_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumsumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 81c4faeb8182fe61c3eaac890f3f2ff2bbc76da9..bf4553f3ab7b417fc2fa366336c91b1b92d1a054 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -381,7 +381,11 @@ void SparseBlas::SPMV(bool transa, &beta, out_descriptor.descriptor(), gpu_type, +#if CUDA_VERSION >= 11040 + CUSPARSE_SPMV_ALG_DEFAULT, +#else CUSPARSE_MV_ALG_DEFAULT, +#endif &buffer_size); }); @@ -399,7 +403,11 @@ void SparseBlas::SPMV(bool transa, &beta, out_descriptor.descriptor(), gpu_type, +#if CUDA_VERSION >= 11040 + CUSPARSE_SPMV_ALG_DEFAULT, +#else CUSPARSE_MV_ALG_DEFAULT, +#endif tmp_buffer_ptr); }); } diff --git a/paddle/phi/kernels/gpu/cum_grad_kernel.cu b/paddle/phi/kernels/gpu/cum_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6039c313b78ed007f74c093111b76a0efe399f30 --- /dev/null +++ b/paddle/phi/kernels/gpu/cum_grad_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
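The same identity, checked end to end through autograd with the public API (values match the NumPy sketch above):

import paddle

x = paddle.to_tensor([1., 2., 3.], stop_gradient=False)
y = paddle.cumsum(x)
y.backward(paddle.to_tensor([1., 2., 3.]))
print(x.grad)  # [6., 5., 3.], computed by the cumsum_grad kernels this patch registers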
+ +#include "paddle/phi/kernels/cum_grad_kernel.h" +#include "paddle/phi/kernels/cum_kernel.h" + +#include +#include +#include +#include +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void CumsumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* x_grad) { + x_grad->Resize(x.dims()); + CumsumKernel( + dev_ctx, out_grad, axis, flatten, exclusive, !reverse, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(cumsum_grad, + GPU, + ALL_LAYOUT, + phi::CumsumGradKernel, + float, + double, + int16_t, + int, + int64_t) {} +#else +PD_REGISTER_KERNEL(cumsum_grad, + GPU, + ALL_LAYOUT, + phi::CumsumGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu index 0c6cd8b5562af4897238e37dc77901224d6a0621..9bf06d7bf19dcd763263447629d3521516bcf736 100644 --- a/paddle/phi/kernels/gpu/cum_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_kernel.cu @@ -270,6 +270,16 @@ void ScanKernel(const Context& dev_ctx, bool reverse, Op op, DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + + // For 0D Tensor + if (out->numel() == 1) { + auto raw_dims = out->dims(); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(raw_dims); + return; + } + auto out_dims = out->dims(); auto size = x.numel(); @@ -286,7 +296,6 @@ void ScanKernel(const Context& dev_ctx, axis += out_dims.size(); } - T* out_data = dev_ctx.template Alloc(out); const T* in_data = x.data(); // Use thrust for parallel acceleration when the input size is equal to the diff --git a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu index 0f000af536d8b1b7817afbf5acac141d3de9eec7..85ca46d7e07351c712553a695045e1241afafc34 100644 --- a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/send_u_recv_kernel.h" #include +#include #include #include diff --git a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index aaae915f9df3e68c5a24f66ca18aaf1bdc4a9913..834a93d629d064f2095eee1041898ade18c7ec1d 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/send_ue_recv_kernel.h" #include +#include #include #include #include diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index 316fe1fae711637af9cafb7ec0c06693aae7f0a6..d420c8f438b16b8614c89a2c0640bf5c5446f388 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index d499cdf54abef913ef9d8c2012e354f81e2cb4e6..b9089dad71657930ef2e108606446f0dca4d248e 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -14,6 +14,9 
@@ limitations under the License. */ #include "paddle/phi/kernels/sparse/coalesce_kernel.h" +#include +#include + #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/xpu/cum_grad_kernel.cc b/paddle/phi/kernels/xpu/cum_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7b11ba47f0a79c708e9662cd158f024188cbb8f3 --- /dev/null +++ b/paddle/phi/kernels/xpu/cum_grad_kernel.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cum_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { + +template +void CumsumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* x_grad) { + x_grad->Resize(x.dims()); + CumsumKernel( + dev_ctx, out_grad, axis, flatten, exclusive, !reverse, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + cumsum_grad, XPU, ALL_LAYOUT, phi::CumsumGradKernel, float, int, int64_t) {} diff --git a/paddle/phi/kernels/xpu/cum_kernel.cc b/paddle/phi/kernels/xpu/cum_kernel.cc index 17eca4008607e65457ccae3b813dc43d5e92ac44..13a1dab66d72f28665ce2d27558230a37e457a0a 100644 --- a/paddle/phi/kernels/xpu/cum_kernel.cc +++ b/paddle/phi/kernels/xpu/cum_kernel.cc @@ -30,6 +30,15 @@ void CumsumKernel(const Context& dev_ctx, using XPUType = typename XPUTypeTrait::Type; dev_ctx.template Alloc(out); + if (x.numel() == 1) { + int r = xpu::copy(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + x.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); + return; + } + // prepare for call xdnn api std::vector x_shape = phi::vectorize(x.dims()); int axis_as_int = axis.to(); diff --git a/paddle/phi/kernels/xpu/logical_kernel.cc b/paddle/phi/kernels/xpu/logical_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6a0ea242d992670131e596d43ec8e16fde1de0d --- /dev/null +++ b/paddle/phi/kernels/xpu/logical_kernel.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
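The CPU, GPU and XPU scan kernels now all short-circuit single-element inputs with a plain copy, allocating the output first and preserving its original dims. For the 0-D case this is the right answer, since the cumulative sum of a single value is the value itself:

import paddle

x = paddle.to_tensor(3.0)   # 0-D tensor, numel() == 1
y = paddle.cumsum(x)
print(y.shape, float(y))    # [] 3.0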
+ +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void LogicalNotKernel(const Context& ctx, + const DenseTensor& x, + DenseTensor* out) { + ctx.template Alloc(out); + int r = + xpu::logical_not(ctx.x_context(), x.data(), out->data(), x.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "logical_not"); +} +} // namespace phi + +PD_REGISTER_KERNEL(logical_not, XPU, ALL_LAYOUT, phi::LogicalNotKernel, bool) {} diff --git a/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc index 52a98c63f48987a348e0df4e15d4469f7402f7c3..8e2f56adfa14147c240949c9d1f483098037cc6b 100644 --- a/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc @@ -33,6 +33,12 @@ void MaskedSelectGradKernel(const Context& dev_ctx, auto mask_shape = phi::vectorize(mask.dims()); auto xshape = phi::vectorize(x_grad->dims()); + if (mask.dims().size() == 0) { + mask_shape = std::vector({1}); + } + if (x_grad->dims().size() == 0) { + xshape = std::vector({1}); + } int r = xpu::masked_select_grad(dev_ctx.x_context(), input_data, diff --git a/paddle/phi/kernels/xpu/masked_select_kernel.cc b/paddle/phi/kernels/xpu/masked_select_kernel.cc index 0f142e852a9a7fd9f77ff10fc439938b89c7b3c4..c572b5c6e4eb741a5b08edac13c227cfaa479663 100644 --- a/paddle/phi/kernels/xpu/masked_select_kernel.cc +++ b/paddle/phi/kernels/xpu/masked_select_kernel.cc @@ -61,6 +61,12 @@ void MaskedSelectKernel(const Context& dev_ctx, auto input_shape = vectorize(input_dim); auto mask_shape = vectorize(mask_dim); + if (input_dim.size() == 0) { + input_shape = std::vector({1}); + } + if (mask_dim.size() == 0) { + mask_shape = std::vector({1}); + } if (out_size_cpu > 0) { PADDLE_ENFORCE_XDNN_SUCCESS(xpu::masked_select(dev_ctx.x_context(), diff --git a/paddle/phi/ops/compat/size_sig.cc b/paddle/phi/ops/compat/cumsum_sig.cc similarity index 57% rename from paddle/phi/ops/compat/size_sig.cc rename to paddle/phi/ops/compat/cumsum_sig.cc index 46177e4ae35b991210b52f2024ba3031c26aff4a..00992b15435d2153ccd38d95689ce9e1ee9f31bc 100644 --- a/paddle/phi/ops/compat/size_sig.cc +++ b/paddle/phi/ops/compat/cumsum_sig.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,7 +11,17 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- #include "paddle/phi/core/compat/op_utils.h" -PD_REGISTER_BASE_KERNEL_NAME(size, numel); +namespace phi { + +KernelSignature CumsumOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("cumsum_grad", + {"X", "Out@GRAD"}, + {"axis", "flatten", "exclusive", "reverse"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(cumsum_grad, phi::CumsumOpArgumentMapping); diff --git a/paddle/phi/ops/compat/slogdeterminant_sig.cc b/paddle/phi/ops/compat/feed_sig.cc similarity index 57% rename from paddle/phi/ops/compat/slogdeterminant_sig.cc rename to paddle/phi/ops/compat/feed_sig.cc index 2e63a90d929085405fd2dbd16647093482094eb9..e28715ce70c63ec06b983a317cbc2eeee6d3346d 100644 --- a/paddle/phi/ops/compat/slogdeterminant_sig.cc +++ b/paddle/phi/ops/compat/feed_sig.cc @@ -16,16 +16,17 @@ namespace phi { -KernelSignature SlogDeterminantGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "slogdet_grad", {"Input", "Out", "Out@GRAD"}, {}, {"Input@GRAD"}); +KernelSignature FeedOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorOutput("Out")) { + return KernelSignature("feed_dense_tensor", {"X"}, {"col"}, {"Out"}); + } else if (ctx.IsSparseCooTensorOutput("Out")) { + return KernelSignature("feed_sparse_coo_tensor", {"X"}, {"col"}, {"Out"}); + } else { + return KernelSignature("feed_strings", {"X"}, {"col"}, {"Out"}); + } } } // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(slogdeterminant, slogdet); -PD_REGISTER_BASE_KERNEL_NAME(slogdeterminant_grad, slogdet_grad); - -PD_REGISTER_ARG_MAPPING_FN(slogdeterminant_grad, - phi::SlogDeterminantGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(feed, feed_dense_tensor); +PD_REGISTER_ARG_MAPPING_FN(feed, phi::FeedOpArgumentMapping); diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc deleted file mode 100644 index ef8eeae358e097b716bd8510ec4d576da7959607..0000000000000000000000000000000000000000 --- a/paddle/phi/ops/compat/graph_send_recv_sig.cc +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
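slogdeterminant_sig.cc can be repurposed as feed_sig.cc because the slogdet name mapping now lives in op_compat.yaml (the `slogdet(slogdeterminant)` entry above) instead of C++ registration macros. The public API is untouched; a quick check (note that in this Paddle version slogdet returns one tensor stacking sign and log|det|):

import paddle

x = paddle.to_tensor([[2., 0.], [0., 3.]])
res = paddle.linalg.slogdet(x)
print(res)  # [1.0, 1.7918...], i.e. sign = 1 and log|det| = log(6)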
*/ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature GraphSendRecvOpArgumentMapping( - const ArgumentMappingContext& ctx) { - if (ctx.HasInput("Out_size")) { - return KernelSignature("send_u_recv", - {"X", "Src_index", "Dst_index"}, - {"reduce_op", "Out_size"}, - {"Out", "Dst_count"}); - } else { - return KernelSignature("send_u_recv", - {"X", "Src_index", "Dst_index"}, - {"reduce_op", "out_size"}, - {"Out", "Dst_count"}); - } -} - -KernelSignature GraphSendRecvGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "send_u_recv_grad", - {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, - {"reduce_op"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv, send_u_recv); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv_grad, send_u_recv_grad); - -PD_REGISTER_ARG_MAPPING_FN(graph_send_recv, - phi::GraphSendRecvOpArgumentMapping); - -PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad, - phi::GraphSendRecvGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc deleted file mode 100644 index aab850831ae334cc1e57f4614b9af7da1b2a014e..0000000000000000000000000000000000000000 --- a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
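Likewise, the deleted graph_send_recv_sig.cc is superseded by the `send_u_recv(graph_send_recv)` entry in op_compat.yaml, including the Out_size tensor-attribute handling the C++ mapping used to branch on. Usage through the Python wrapper (assuming paddle.geometric.send_u_recv, the public entry point for this op in 2.4+):

import paddle

x = paddle.to_tensor([[0., 2.], [1., 3.], [2., 4.]])
src_index = paddle.to_tensor([0, 1, 2, 0], dtype="int32")
dst_index = paddle.to_tensor([1, 2, 1, 0], dtype="int32")
# row i of the result reduces x[src_index[j]] over every edge j with dst_index[j] == i
out = paddle.geometric.send_u_recv(x, src_index, dst_index, reduce_op="sum")
print(out)
# [[0., 2.],   node 0 <- src 0
#  [2., 6.],   node 1 <- src 0 and src 2
#  [1., 3.]]   node 2 <- src 1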
*/ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature GraphSendUERecvOpArgumentMapping( - const ArgumentMappingContext& ctx) { - if (ctx.HasInput("Out_size")) { - return KernelSignature("send_ue_recv", - {"X", "Y", "Src_index", "Dst_index"}, - {"message_op", "reduce_op", "Out_size"}, - {"Out", "Dst_count"}); - } else { - return KernelSignature("send_ue_recv", - {"X", "Y", "Src_index", "Dst_index"}, - {"message_op", "reduce_op", "out_size"}, - {"Out", "Dst_count"}); - } -} - -KernelSignature GraphSendUERecvGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "send_ue_recv_grad", - {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, - {"message_op", "reduce_op"}, - {"X@GRAD", "Y@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv, send_ue_recv); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv_grad, send_ue_recv_grad); - -PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv, - phi::GraphSendUERecvOpArgumentMapping); - -PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv_grad, - phi::GraphSendUERecvGradOpArgumentMapping); diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index 415c1f21465edd37ed96285d07ed74b8f46012f1..791167ffe62a6449d7eafa75eb5fd976c909874c 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -70,8 +70,10 @@ TEST(Backend, StringToBackend) { #else EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS")); #endif - EXPECT_EQ(static_cast( - static_cast(phi::Backend::NUM_BACKENDS) + 1), + EXPECT_EQ(static_cast( + static_cast(Backend::NUM_BACKENDS) + + phi::CustomRegisteredDeviceMap::Instance() + .GetOrRegisterGlobalDeviceTypeId("CustomBackend")), pexp::StringToBackend("CustomBackend")); } diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index 7f89fb34994fcd8186289cea3a7aee9ad565f08d..c267e60509120be24fa470e835df0f84336c1195 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -94,6 +94,10 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return false; } + bool IsSparseCooTensorOutput(const std::string& name) const override { + return false; + } + bool IsDenseTensorOutput(const std::string& name) const override { return dense_tensor_outputs.count(name) > 0; } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8449b7158cd2c82b9ad9f89d26a2144bbd7c83e1..4c48154b80a4b21a063af640b122e5b1b296284e 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3461,7 +3461,10 @@ function trt_convert_test() { function build_pr_and_develop() { run_setup ${PYTHON_ABI:-""} bdist_wheel ${parallel_number} - mkdir ${PADDLE_ROOT}/build/python/dist/ && mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ + if [ ! -d "${PADDLE_ROOT}/build/python/dist/" ]; then + mkdir ${PADDLE_ROOT}/build/python/dist/ + fi + mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` cp ${PADDLE_ROOT}/python/requirements.txt /tmp generate_api_spec "$1" "PR" @@ -3483,7 +3486,10 @@ function build_pr_and_develop() { else git checkout -b develop_base_pr upstream/$BRANCH run_setup ${PYTHON_ABI:-""} bdist_wheel ${parallel_number} - mkdir ${PADDLE_ROOT}/build/python/dist/ && mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ + if [ ! 
-d "${PADDLE_ROOT}/build/python/dist/" ]; then + mkdir ${PADDLE_ROOT}/build/python/dist/ + fi + mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ generate_api_spec "$1" "DEV" mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl fi diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index cef75772283166f052befe0cadccc949a7e276a3..3e78b716faabb1ba5dfaa44a10b808e0e1a9d3eb 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -55,7 +55,7 @@ from .framework.dtype import bool # noqa: F401 from .framework.dtype import complex64 # noqa: F401 from .framework.dtype import complex128 # noqa: F401 -if fluid.framework._in_eager_mode_: +if fluid.framework.global_var._in_eager_mode_: Tensor = framework.core.eager.Tensor else: from .framework import VarBase as Tensor # noqa: F401 diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 64c2423d4395e8708f673fc8fad83b5a645f83b9..ce017ef98540d3165a0e82e469f3bb303eb88b98 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -107,7 +107,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False): each_tensor, (paddle.Tensor, core.eager.Tensor) ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'." else: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: grad_tensors = [] else: grad_tensors = [None] * len(tensors) @@ -119,7 +119,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False): assert isinstance(retain_graph, bool), "retain_graph must be True or False" - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: core.eager.run_backward(tensors, grad_tensors, retain_graph) else: core.dygraph_run_backward( diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index df975d06a45d4d568f136dcdaf777887066db8de..76401d5c47a9aba70631a88ad05edf4b37db2f79 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -385,12 +385,13 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): def _create_loss_op_desc_(loss): + create_shape = [] if len(loss.shape) == 0 else [1] op_desc = _create_op_desc_( "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, { - "shape": [1], + "shape": create_shape, "value": 1.0, "dtype": loss.dtype, "force_cpu": False, @@ -1491,11 +1492,15 @@ def _append_backward_ops_( ) # remove some backward ops - not_need_ops = _find_not_need_ops(grad_op_descs, ops, input_grad_names_set) + # TODO(Jiabin): Support this in prime later, it will prune add_grad, fix this problem + if not core.is_prim_enabled(): + not_need_ops = _find_not_need_ops( + grad_op_descs, ops, input_grad_names_set + ) - grad_op_descs = [ - op_desc for op_desc in grad_op_descs if op_desc not in not_need_ops - ] + grad_op_descs = [ + op_desc for op_desc in grad_op_descs if op_desc not in not_need_ops + ] # append op_desc in grad_op_descs to target_block op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 299ca1e36614b6189f484818419bca81d60fbf58..771caa4ef3c4fa822dd40d0077b90e778bbbb5d6 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -317,6 +317,7 @@ try: from .libpaddle import _array_to_share_memory_tensor from .libpaddle import _cleanup_mmap_fds from .libpaddle import 
_remove_tensor_list_mmap_fds + from .libpaddle import _set_max_memory_map_allocation_pool_size except Exception as e: if has_paddle_dy_lib: sys.stderr.write( @@ -371,3 +372,37 @@ def set_paddle_lib_path(): set_paddle_lib_path() + + +def set_prim_forward(value): + """set flag FLAGS_prim_forward.""" + flag = str(value) + if flag.lower() not in ["true", "false", "debug"]: + raise TypeError(f"flag {flag} should be string of bool or 'debug'.") + os.environ["FLAGS_prim_forward"] = flag + return + + +def enable_prim_forward(): + flag = os.getenv("FLAGS_prim_forward", "true").lower() + if flag == "false": + return False + if flag == "debug": + return "debug" + return True + + +def set_prim_backward(value): + """set flag FLAGS_prim_backward,""" + flag = str(value) + if flag.lower() not in ["true", "false"]: + raise TypeError(f"flag {flag} should be bool or string of bool.") + os.environ["FLAGS_prim_backward"] = flag + return + + +def enable_prim_backward(): + flag = os.getenv("FLAGS_prim_backward", "true") + if flag.lower() == "false": + return False + return True diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index fc1effbd89c7afd704a3b6bb657dc9c11d395312..c7c49c794a1017b1788a489cde6030d4ca374c1b 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -20,6 +20,7 @@ import numbers import logging import itertools import threading +import warnings import numpy as np from collections import namedtuple from paddle.fluid.framework import ( @@ -406,6 +407,20 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._base_seed = np.random.randint(low=0, high=sys.maxsize) + # Note(zhangbo): shm_buffer_size is used for MemoryMapAllocationPool. + # MemoryMapAllocationPool is used to cache and reuse shm, thus reducing munmap in dataloader. + # For more details, please see: paddle/fluid/memory/allocation/mmap_allocator.h + try: + self._worker_shm_buffer_size = (2 + 1) * len(self._dataset[0]) + except: + self._worker_shm_buffer_size = 0 + warnings.warn( + "Setting the shm cache buffer size to 0, equivalent to not using the shm cache policy." 
+ ) + self._main_thread_shm_buffer_size = ( + (self._worker_shm_buffer_size) * 2 * self._num_workers + ) + # init workers and indices queues and put 2 indices in each indices queue self._init_workers() for _ in range(self._outstanding_capacity): @@ -450,6 +465,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._num_workers, self._use_shared_memory, self._base_seed, + self._worker_shm_buffer_size, ), ) worker.daemon = True @@ -481,6 +497,9 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._blocking_queue = core.init_lod_tensor_blocking_queue( core.Variable(), self._outstanding_capacity, len(self._places) > 1 ) + core._set_max_memory_map_allocation_pool_size( + self._main_thread_shm_buffer_size + ) self._reader = core.create_py_reader( self._blocking_queue, self._var_names, diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index e63a5d5f9344142626cf6f8093e509183b92d732..f486e80d746ea7de44750abea89b1ebece6f62f3 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -275,6 +275,7 @@ def _worker_loop( num_workers, use_shared_memory, base_seed, + shm_cahce_size=0, ): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, @@ -286,6 +287,8 @@ def _worker_loop( # set signal handler core._set_process_signal_handler() + core._set_max_memory_map_allocation_pool_size(shm_cahce_size) + # set different numpy seed for each worker try: import numpy as np diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index d064c5194e031d31ed2bbd77e63c8f86b2f7b517..df500a129787da93ff7e0c687074118cad2c0d31 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -20,6 +20,7 @@ import sys import numpy as np from paddle.fluid import core from paddle.fluid import framework +from paddle.fluid.framework import global_var from paddle.fluid.multiprocess_utils import CleanupFuncRegistrar from .tracer import Tracer import logging @@ -44,7 +45,6 @@ __all__ = [ ] # Flag that indicates whether running code under `@to_static` -_in_declarative_mode_ = False def in_declarative_mode(): @@ -52,7 +52,7 @@ def in_declarative_mode(): Return a bool value that indicates whether running code under `@to_static` """ - return _in_declarative_mode_ + return global_var._in_declarative_mode_ def declarative_unsupport_argument_warning( @@ -86,11 +86,11 @@ switch_to_static_graph = wrap_decorator(_switch_to_static_graph_) @signature_safe_contextmanager def _switch_declarative_mode_guard_(is_declarative=True): - global _in_declarative_mode_ - original_val = _in_declarative_mode_ - _in_declarative_mode_ = is_declarative + global global_var + original_val = global_var._in_declarative_mode_ + global_var._in_declarative_mode_ = is_declarative yield - _in_declarative_mode_ = original_val + global_var._in_declarative_mode_ = original_val @signature_safe_contextmanager @@ -106,9 +106,6 @@ def program_desc_tracing_guard(enable): tracer._enable_program_desc_tracing = original_val -_functional_dygraph_context_manager = None - - @signature_safe_contextmanager def param_guard(parameters): # Note: parameters is a reference of self._parameters or self._buffers @@ -228,12 +225,12 @@ def enable_dygraph(place=None): print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode """ - global _functional_dygraph_context_manager - if _functional_dygraph_context_manager is None: - _functional_dygraph_context_manager = guard( + global global_var + if 
global_var._functional_dygraph_context_manager is None: + global_var._functional_dygraph_context_manager = guard( place=_get_paddle_place(place) ) - _functional_dygraph_context_manager.__enter__() + global_var._functional_dygraph_context_manager.__enter__() # call disable_dygraph when Python exit CleanupFuncRegistrar.register(disable_dygraph) @@ -263,10 +260,10 @@ def disable_dygraph(): print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode """ - global _functional_dygraph_context_manager - if _functional_dygraph_context_manager is not None: - _functional_dygraph_context_manager.__exit__(*sys.exc_info()) - _functional_dygraph_context_manager = None + global global_var + if global_var._functional_dygraph_context_manager is not None: + global_var._functional_dygraph_context_manager.__exit__(*sys.exc_info()) + global_var._functional_dygraph_context_manager = None @signature_safe_contextmanager diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index cb78b8b9d5932fa3778ed2cd77db7a6dd53f102f..74a174674f64c028a34b7f6a8c66ffbf6712a86e 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -74,7 +74,7 @@ def monkey_patch_math_varbase(): @no_grad def create_tensor(value, dtype, shape): - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: out = _C_ops.full( shape, value, dtype, framework._current_expected_place() ) @@ -251,7 +251,7 @@ def monkey_patch_math_varbase(): # 2. create varbase for scalar lhs_dtype = self.dtype - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: other_var_should_be = core.eager.Tensor else: other_var_should_be = core.VarBase @@ -486,7 +486,7 @@ def monkey_patch_math_varbase(): global _already_patch_varbase global _already_patch_eager_tensor - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: local_already_patch = _already_patch_eager_tensor _already_patch_eager_tensor = True local_tensor = core.eager.Tensor @@ -496,7 +496,7 @@ def monkey_patch_math_varbase(): local_tensor = core.VarBase if not local_already_patch: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: for method_name in eager_cpp_level_patch: method_impl = getattr(local_tensor, method_name, None) if method_impl: diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 2aa9a822aa99041c3c226dd8fada6fe5a1a5a59f..9f0d8297f349b4a5b1df64e0983ed29aeb19899d 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -54,7 +54,9 @@ class TensorHookRemoveHelper: def __init__(self, tensor, hook_id): self._tensor = ( - tensor if framework._in_eager_mode_ else weakref.ref(tensor) + tensor + if framework.global_var._in_eager_mode_ + else weakref.ref(tensor) ) self._hook_id = hook_id @@ -65,7 +67,11 @@ class TensorHookRemoveHelper: Returns: bool: Return True if removed successfully """ - tensor = self._tensor if framework._in_eager_mode_ else self._tensor() + tensor = ( + self._tensor + if framework.global_var._in_eager_mode_ + else self._tensor() + ) if tensor is not None: res = tensor._remove_grad_hook(self._hook_id) if res is True: @@ -178,7 +184,7 @@ def monkey_patch_varbase(): out = linear(t) # call with different weight """ - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: base_tensor = core.eager.Tensor else: base_tensor = core.VarBase @@ -282,7 
+288,7 @@ def monkey_patch_varbase(): ) record_event.begin() if grad_tensor is not None: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: assert isinstance( grad_tensor, core.eager.Tensor ), "The type of grad_tensor must be paddle.Tensor" @@ -296,7 +302,7 @@ def monkey_patch_varbase(): grad_tensor.name, grad_tensor.shape, self.name, self.shape ) - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: if grad_tensor is None: grad_tensor = [] else: @@ -311,7 +317,7 @@ def monkey_patch_varbase(): ): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: core.eager.run_backward( [scaled_loss], grad_tensor, retain_graph ) @@ -323,7 +329,7 @@ def monkey_patch_varbase(): framework._dygraph_tracer(), ) else: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: core.eager.run_backward([self], grad_tensor, retain_graph) else: core.dygraph_run_backward( @@ -368,7 +374,7 @@ def monkey_patch_varbase(): # [500.] """ - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: if self.grad is None: return None if self.grad.is_selected_rows(): @@ -673,7 +679,7 @@ def monkey_patch_varbase(): # [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436], # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) """ - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: from paddle.tensor.to_string import tensor_to_string return tensor_to_string(self) @@ -707,7 +713,7 @@ def monkey_patch_varbase(): raise RuntimeError( "Only Leaf Tensor support the deepcopy at the moment, non-Leaf Tensors contains graph information that does't support deepcopy" ) - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: new_varbase = core.eager.Tensor() else: new_varbase = core.VarBase() @@ -725,7 +731,7 @@ def monkey_patch_varbase(): assert ( numel == 1 ), "When Variable is used as the condition of if/while , Variable can only contain one element." - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: assert self._is_initialized(), "tensor not initialized" return bool(np.all(self.numpy() > 0)) else: @@ -850,7 +856,7 @@ def monkey_patch_varbase(): return _setitem_impl_(self, item, value) else: - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: return self.__setitem_eager_tensor__(item, value) else: # Call c++ func __setitem_varbase__ to speedup. 
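All of these mechanical edits swap module-level flags (framework._in_eager_mode_ and friends) for attributes on framework.global_var, an instance of the GlobalThreadLocal class introduced in the framework.py hunk below. A minimal sketch of the underlying pattern (illustrative, not Paddle code):

import threading

class ThreadState(threading.local):
    def __init__(self):
        # threading.local runs __init__ once per thread on first access,
        # so each thread gets an independent copy of these fields
        self.in_eager_mode = True

state = ThreadState()

def worker():
    state.in_eager_mode = False   # visible only inside this thread
    print(state.in_eager_mode)    # False

t = threading.Thread(target=worker)
t.start()
t.join()
print(state.in_eager_mode)        # still True in the main thread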
@@ -1020,7 +1026,7 @@ def monkey_patch_varbase(): def __hash__(self): return hash(id(self)) - if framework._in_eager_mode_ and not hasattr(core, "eager"): + if framework.global_var._in_eager_mode_ and not hasattr(core, "eager"): return for method_name, method in ( @@ -1047,12 +1053,12 @@ def monkey_patch_varbase(): ("to_dense", to_dense), ("to_sparse_coo", to_sparse_coo), ): - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: setattr(core.eager.Tensor, method_name, method) else: setattr(core.VarBase, method_name, method) - if framework._in_eager_mode_: + if framework.global_var._in_eager_mode_: setattr(core.eager.Tensor, "_set_grad_ivar", _set_grad_ivar) setattr(core.eager.Tensor, "value", value) setattr(core.eager.Tensor, "cpu", cpu) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index d5db9a7f72c0de77c7f8ee6a3467bf6f721bcf01..e3376d8446586607947bf2143fe5c9fe32115dac 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -26,6 +26,7 @@ from .framework import convert_np_dtype_to_dtype_, _apply_pass from . import core from . import unique_name from . import compiler +from . import set_flags from .trainer_factory import TrainerFactory from .trainer_factory import FetchHandlerMonitor import copy @@ -510,6 +511,16 @@ def _is_dy2st_enable_standalone_executor(): ] +def _is_cuda_graph_enable_standalone_executor(): + return framework._cuda_graph_enable_standalone_executor_ in [ + 1, + '1', + True, + 'True', + 'true', + ] + + def _prepare_fleet_executor(): from ..distributed.fleet.proto import fleet_executor_desc_pb2 @@ -844,7 +855,19 @@ class _ExecutorCache: ) build_strategy = compiled_program._build_strategy # print(f"Program before convert:\n {inner_program}", flush=True) + use_cuda_graph = False + # When using cuda graph, the cuda graph preparation logic in PE is not + # executed, but it is processed in the constructor of new executor. + if ( + build_strategy is not None + and build_strategy.allow_cuda_graph_capture + ): + use_cuda_graph = True + build_strategy.allow_cuda_graph_capture = False + set_flags({"FLAGS_new_executor_use_cuda_graph": True}) compiled_program._compile(scope, place) + if use_cuda_graph: + build_strategy.allow_cuda_graph_capture = True ir_graph = framework.IrGraph(compiled_program._graph) converted_program = ir_graph.to_program() @@ -1746,24 +1769,25 @@ class Executor: ) return False - # Unsupported case 4: CUDA Graph + # Unsupported case 4: async mode if ( compiled_program._build_strategy is not None - and compiled_program._build_strategy.allow_cuda_graph_capture + and compiled_program._build_strategy.async_mode ): warnings.warn( - "Standalone executor is not used for CUDA Graph", + "Standalone executor is not used for async mode", UserWarning, ) return False - # Unsupported case 5: async mode + # Unsupported case 5: CUDA Graph if ( compiled_program._build_strategy is not None - and compiled_program._build_strategy.async_mode + and compiled_program._build_strategy.allow_cuda_graph_capture + and not _is_cuda_graph_enable_standalone_executor() ): warnings.warn( - "Standalone executor is not used for async mode", + "Standalone executor is not used for CUDA Graph when FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=0", UserWarning, ) return False @@ -1811,8 +1835,13 @@ class Executor: tensor = core.get_variable_tensor(scope, lr_sheduler._var_name) # NOTE(dev): `tensor.set(data, self.place)` always call TensorCopySync that is a blocking behavior. So we use `_copy_from` to replace it. 
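With the executor changes above, CUDA Graph capture no longer rules out the standalone executor outright: the executor enables FLAGS_new_executor_use_cuda_graph itself when a compiled program requests capture, and only falls back when FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=0. The flag can also be toggled from user code (a sketch):

import paddle

# let the new (standalone) executor own CUDA Graph preparation
paddle.set_flags({"FLAGS_new_executor_use_cuda_graph": True})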
cpu_tensor = _as_lodtensor(data, core.CPUPlace()) - # for ipu, tensor is allocated on cpu - if core.is_compiled_with_ipu(): + if core.is_cuda_graph_capturing(): + warnings.warn( + "Caution!!! When capturing CUDA Graph, the learning rate scheduler would not " + "take any effect! Please set the learning rate manually before each batch!" + ) + elif core.is_compiled_with_ipu(): + # for ipu, tensor is allocated on cpu tensor._copy_from(cpu_tensor, tensor._place()) else: tensor._copy_from(cpu_tensor, self.place) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 41b9b8bbb2deb74882180263e94cb3eb70709dc9..da2fa96c758bae003952b6c2181b35974231c0bd 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -36,6 +36,7 @@ import paddle.version as fluid_version import warnings import functools from .variable_index import _getitem_impl_, _setitem_impl_ +import threading __all__ = [ 'Program', @@ -70,8 +71,42 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() +# use thread local to create thread save global variables. +class GlobalThreadLocal(threading.local): + def __init__(self): + """ + init the thread local data. + TODO(xiongkun): how to access another thread local data ? + """ + global _dygraph_tracer_ + self._in_declarative_mode_ = False + self._functional_dygraph_context_manager = None + self._dygraph_tracer_ = _dygraph_tracer_ + self._in_eager_mode_ = True + + def __str__(self): + strings = [] + strings.append( + "_in_declarative_mode_:" + str(self._in_declarative_mode_) + ) + strings.append( + "_functional_dygraph_context_manager:" + + str(self._functional_dygraph_context_manager) + ) + strings.append("_dygraph_tracer_:" + str(self._dygraph_tracer_)) + strings.append("_in_eager_mode_:" + str(self._in_eager_mode_)) + return "\n".join(strings) + + def __setattr__(self, name, val): + if name == '_dygraph_tracer_': + global _dygraph_tracer_ + _dygraph_tracer_ = val + self.__dict__[name] = val + + _dygraph_tracer_ = None -_in_eager_mode_ = True +global_var = GlobalThreadLocal() + _global_expected_place_ = None _current_device = None global_prog_seed = 0 @@ -86,6 +121,9 @@ _enable_standalone_executor_ = os.environ.get( _dy2st_enable_standalone_executor_ = os.environ.get( 'FLAGS_DY2ST_USE_STANDALONE_EXECUTOR', 1 ) +_cuda_graph_enable_standalone_executor_ = os.environ.get( + 'FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR', 0 +) # Some explanation of our execution system 2022.03 # For now we have 3 kinds of execution system, since we refactored dygraph mode to @@ -152,20 +190,17 @@ def _switch_tensor_bind_type(is_eager): def _enable_legacy_dygraph(): - global _in_eager_mode_ - _in_eager_mode_ = False + global_var._in_eager_mode_ = False _update_monkey_methods(is_eager=False) def _disable_legacy_dygraph(): - global _in_eager_mode_ - _in_eager_mode_ = True + global_var._in_eager_mode_ = True _update_monkey_methods(is_eager=True) def _in_eager_without_dygraph_check(): - global _in_eager_mode_ - return _in_eager_mode_ + return global_var._in_eager_mode_ # FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but @@ -174,7 +209,6 @@ _is_first_import_ = True def _fallback_legacy_dygraph(): - global _in_eager_mode_ global _is_first_import_ need_fallback = False # Only enable eager on CPU/GPU/XPU @@ -184,12 +218,12 @@ def _fallback_legacy_dygraph(): or core.is_compiled_with_mlu() ) - if _in_eager_mode_ and is_not_support: + if global_var._in_eager_mode_ and 
is_not_support: # switch into legacy dygraph mode warnings.warn( "We will fallback into legacy dygraph on NPU/XPU/MLU/IPU/ROCM devices. Because we only support new eager dygraph mode on CPU/GPU currently. " ) - _in_eager_mode_ = False + global_var._in_eager_mode_ = False if not _is_first_import_: _enable_legacy_dygraph() need_fallback = True @@ -231,11 +265,13 @@ def in_dygraph_mode(): print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode """ - return (_dygraph_tracer_ is not None) and _in_eager_mode_ + return ( + global_var._dygraph_tracer_ is not None + ) and global_var._in_eager_mode_ def _non_static_mode(): - return _dygraph_tracer_ is not None + return global_var._dygraph_tracer_ is not None @signature_safe_contextmanager @@ -600,7 +636,7 @@ non_static_only = wrap_decorator(_non_static_only_) def _dygraph_tracer(): - return _dygraph_tracer_ + return global_var._dygraph_tracer_ def _global_flags(): @@ -668,9 +704,8 @@ def _current_expected_place(): def _set_dygraph_tracer_expected_place(place): - global _dygraph_tracer_ - if _dygraph_tracer_ is not None: - _dygraph_tracer_._expected_place = place + if global_var._dygraph_tracer_ is not None: + global_var._dygraph_tracer_._expected_place = place def _set_expected_place(place): @@ -1312,7 +1347,7 @@ def _varbase_creator( if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if _in_eager_mode_: + if global_var._in_eager_mode_: eager_tensor = core.eager.Tensor( dtype if dtype else core.VarDesc.VarType.FP32, list(shape) if shape else [], @@ -7457,16 +7492,17 @@ def _get_var(name, program=None): @signature_safe_contextmanager def _dygraph_guard(tracer): - global _dygraph_tracer_ - tmp_tracer = _dygraph_tracer_ - _dygraph_tracer_ = tracer - core._switch_tracer(tracer) + tmp_tracer = global_var._dygraph_tracer_ + global_var._dygraph_tracer_ = tracer + if tracer is not None: + core._switch_tracer(tracer) try: yield finally: - core._switch_tracer(tmp_tracer) - _dygraph_tracer_ = tmp_tracer + if tmp_tracer is not None: + core._switch_tracer(tmp_tracer) + global_var._dygraph_tracer_ = tmp_tracer @signature_safe_contextmanager diff --git a/python/paddle/fluid/lazy_init.py b/python/paddle/fluid/lazy_init.py index 54755c0787947feb040b95c008d141c3f64452b4..1851056f2c2ce4d08adf6ce255231278e531ebce 100644 --- a/python/paddle/fluid/lazy_init.py +++ b/python/paddle/fluid/lazy_init.py @@ -59,8 +59,8 @@ class LazyInitHelper: self.enable() if self._in_guard: return - self._tracer = framework._dygraph_tracer_ - framework._dygraph_tracer_ = None + self._tracer = framework.global_var._dygraph_tracer_ + framework.global_var._dygraph_tracer_ = None self._in_guard = True def __exit__(self, *args, **kwargs): @@ -71,7 +71,7 @@ class LazyInitHelper: if not self._in_guard: return assert self._tracer is not None - framework._dygraph_tracer_ = self._tracer + framework.global_var._dygraph_tracer_ = self._tracer self._tracer = None self._in_guard = False diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 8d8046f19aa79f83ccb1f8757a3099b03b7ec61d..833993c621612f28a3acc895b544e9cc084b4486 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -21,7 +21,6 @@ import numpy as np import paddle import paddle.static as static -from paddle import fluid from paddle.utils.cpp_extension.extension_utils import run_cmd from 
paddle.vision.transforms import Compose, Normalize @@ -146,8 +145,10 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): paddle.set_device(device) t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() dx = paddle.grad( outputs=out, inputs=t, @@ -259,7 +260,6 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): ) def test_dynamic(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -286,7 +286,6 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): x_grad, pd_x_grad ), ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_static_save_and_load_inference_model(self): paddle.enable_static() @@ -354,7 +353,6 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): paddle.disable_static() def test_double_grad_dynamic(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -380,7 +378,6 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): dx_grad, pd_dx_grad ), ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_with_dataloader(self): for device in self.devices: diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py index 7b251e8063a05e7d4a09238feaf1efef04739fe4..ef0f52d5c3f2dafbc6a480aa2c1497c87b793666 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py @@ -30,8 +30,10 @@ def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): t = paddle.to_tensor(np_x, dtype=dtype) t.stop_gradient = False + t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() out.stop_gradient = False out.backward() @@ -142,14 +144,14 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix): def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): - import paddle.fluid as fluid - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) paddle.set_device(device) t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() dx = paddle.grad( outputs=out, inputs=t, @@ -164,7 +166,6 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): grad_outputs=paddle.ones_like(t), create_graph=False, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) assert ddout[0].numpy() is not None return dx[0].numpy(), ddout[0].numpy() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py index bedaf36832f91ee600ea7f789ea0ff6b73366a78..ad139e84a90c4dedd49d28567ed9cb83f62ff7b5 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py @@ -19,7 +19,6 @@ import numpy as np from utils import extra_cc_args, extra_nvcc_args, paddle_includes import paddle -import paddle.fluid as fluid from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -41,24 
+40,25 @@ custom_ops = load( def custom_tanh_double_grad_dynamic(func, device, dtype, np_x): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) paddle.set_device(device) t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() out = func(t) out.stop_gradient = False + out.retain_grads() dx = paddle.grad( outputs=[out], inputs=[t], create_graph=True, retain_graph=True ) + dx[0].retain_grads() dx[0].backward() assert out.grad is not None assert dx[0].grad is not None return dx[0].numpy(), dx[0].grad.numpy(), out.grad.numpy() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) class TestCustomTanhDoubleGradJit(unittest.TestCase): @@ -68,7 +68,6 @@ class TestCustomTanhDoubleGradJit(unittest.TestCase): self.devices = ['cpu'] def test_double_grad_dynamic(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for device in self.devices: for dtype in self.dtypes: x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) @@ -102,7 +101,6 @@ class TestCustomTanhDoubleGradJit(unittest.TestCase): dout, pd_dout ), ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_op_setup.py b/python/paddle/fluid/tests/custom_runtime/test_custom_op_setup.py index 969fdb2f8a6b2229d77c805fcf977ee6dbcc926a..b347ee139728abf9029a34af605d966285301d27 100644 --- a/python/paddle/fluid/tests/custom_runtime/test_custom_op_setup.py +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_op_setup.py @@ -24,11 +24,11 @@ import numpy as np def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): import paddle - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) paddle.set_device(device) t = paddle.to_tensor(np_x, dtype=dtype) t.stop_gradient = False + t.retain_grads() sys.stdout.flush() out = func(t) if use_func else paddle.nn.functional.relu(t) @@ -36,7 +36,6 @@ def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): out.backward() - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if t.grad is None: return out.numpy(), t.grad else: @@ -105,11 +104,12 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): import paddle paddle.set_device(device) - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() dx = paddle.grad( outputs=out, inputs=t, @@ -125,7 +125,6 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): create_graph=False, ) - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) assert ddout[0].numpy() is not None return dx[0].numpy(), ddout[0].numpy() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d99deb2bfc58019de279f3b41a8c910a1aab5a1..2eea2070befe39a230c4ac77306983abe0be4ca9 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1259,3 +1259,7 @@ set_tests_properties(test_parallel_executor_dry_run PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=0") set_tests_properties(test_parallel_executor_drop_scope PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=0") + +set_tests_properties( + test_cuda_graph_static_mode + PROPERTIES ENVIRONMENT "FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=1") diff --git 
a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py index dea1d1ee2d9e1f71a175d664864aac4493eefb51..13ebddbd786da06a5b7930b533db864f69d4b937 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py @@ -30,8 +30,6 @@ class TestCollectiveAllToAllSingle(unittest.TestCase): paddle.distributed.is_initialized() ), "The distributed environment has been initialized." - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - def test_collective_alltoall_single(self): rank = dist.get_rank() size = dist.get_world_size() diff --git a/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py b/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py index ec3b2ad5e4a4b622c111e857dcda5ce73e97d574..13dbd974a1bca81746fbd2e19b7950740409a0c2 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_batch_isend_irecv.py @@ -23,7 +23,6 @@ import paddle.distributed as dist class TestCollectiveBatchIsendIrecv(unittest.TestCase): def setUp(self): dist.init_parallel_env() - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) def test_collective_batch_isend_irecv(self): rank = dist.get_rank() diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py index c2cf243ee02cd7edda79a14a8c578fa825a34790..7017237cacd1e5be5f4e304270c5521233d73962 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py @@ -24,7 +24,6 @@ from paddle.distributed.communication.reduce_scatter import _reduce_scatter_base class TestCollectiveReduceScatter(unittest.TestCase): def setUp(self): dist.init_parallel_env() - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) def test_collective_reduce_scatter_sum(self): rank = dist.get_rank() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py index 78e119fa8fb424693e5cc28d16170a535af4541b..97dd4e39395914d83ac559f3335967fb25743395 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_margin_cross_entropy.py @@ -34,7 +34,6 @@ class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() fleet.init(is_collective=True, strategy=strategy) - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) def test_parallel_margin_softmax_cross_entropy(self): margin1s = [1.0, 1.0, 1.35] @@ -93,6 +92,7 @@ class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): norm_weight = paddle.divide(weight, weight_l2) data = paddle.matmul(norm_input, norm_weight) + data.retain_grads() data.stop_gradient = False sta = ( @@ -118,6 +118,7 @@ class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): group=check_group, ) integral_data = integral_data.detach().clone() + integral_data.retain_grads() integral_data.stop_gradient = False # add arcface margin to logit diff --git 
a/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax.py b/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax.py index 2ac671962c0389a97e32e15851c12e2478d04ea9..c7c876b8f8fea6c79bfdf91c95e4dc16d7cefb93 100644 --- a/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax.py +++ b/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax.py @@ -57,8 +57,7 @@ attrs = Attr() def fn(x): - y = paddle.tan(x) - return F.softmax(y, axis=attrs.axis, dtype=attrs.dtype) + return F.softmax(x, axis=attrs.axis, dtype=attrs.dtype) def expect_forward(inputs): @@ -81,8 +80,17 @@ class TestCompositeSoftmax(unittest.TestCase): ) y = fn(x) blocks = main_program.blocks + + fwd_ops = [op.type for op in blocks[0].ops] + # Ensure that softmax is in the original block + self.assertTrue('softmax' in fwd_ops) + paddle.incubate.autograd.to_prim(blocks) + fwd_ops_new = [op.type for op in blocks[0].ops] + # Ensure that softmax is split into small ops + self.assertTrue('softmax' not in fwd_ops_new) + exe = paddle.static.Executor() exe.run(startup_program) res = exe.run(main_program, feed={'x': inputs}, fetch_list=[y]) @@ -97,7 +105,7 @@ class TestCompositeSoftmax(unittest.TestCase): actual = self.cal_composite(np_data)[0] assert expect.dtype == actual.dtype - assert np.allclose( + np.testing.assert_allclose( expect, actual, rtol=attrs.get_rtol("forward"), diff --git a/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax_grad.py b/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax_grad.py index c47399ba5a983034bc5c1ac91cd7b57577f0ef18..808c5f8324b65a87efa5c46005c553f5f58703fb 100644 --- a/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax_grad.py +++ b/python/paddle/fluid/tests/unittests/composite_ops/test_composite_softmax_grad.py @@ -19,6 +19,7 @@ from utils import TOLERANCE import paddle import paddle.nn.functional as F +from paddle.fluid import core def generate_data(shape, dtype="float32"): @@ -57,11 +58,11 @@ attrs = Attr() def fn(x): - y = paddle.tan(x) - return F.softmax(y, axis=attrs.axis, dtype=attrs.dtype) + return F.softmax(x, axis=attrs.axis, dtype=attrs.dtype) def expect_grad(inputs): + paddle.disable_static() inputs.stop_gradient = False res = fn(inputs) @@ -86,8 +87,22 @@ class TestCompositeSoftmax(unittest.TestCase): x.stop_gradient = False y = fn(x) blocks = main_program.blocks + + fwd_ops = [op.type for op in blocks[0].ops] + # Ensure that softmax is in the original block + self.assertTrue('softmax' in fwd_ops) + paddle.incubate.autograd.to_prim(blocks) + + fwd_ops_new = [op.type for op in blocks[0].ops] + # Ensure that softmax is split into small ops + self.assertTrue('softmax' not in fwd_ops_new) + z = paddle.static.gradients([y], x) + fwd_ops_grad = [op.type for op in blocks[0].ops] + # Ensure that softmax_grad is not in the grad block + + self.assertTrue('softmax_grad' not in fwd_ops_grad) exe = paddle.static.Executor() exe.run(startup_program) @@ -103,7 +118,7 @@ class TestCompositeSoftmax(unittest.TestCase): actual = self.cal_composite_grad(np_data)[0] assert expect.dtype == actual.dtype - assert np.allclose( + np.testing.assert_allclose( expect, actual, rtol=attrs.get_rtol("backward"), @@ -120,5 +135,59 @@ class TestCompositeSoftmax(unittest.TestCase): self.compare_backward() +class TestCompositeSoftmaxPrimBackward(unittest.TestCase): + "test composite softmax and prim backward" + + def setUp(self): + core.set_prim_enabled(True) + self.dtypes = ["float32"] + self.shapes = 
[[2, 3, 4], [2, 3]] + self.axes = [-1, 0, 1] + + def cal_composite_grad(self, inputs): + paddle.enable_static() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data( + 'x', shape=inputs.shape, dtype=str(inputs.dtype) + ) + x.stop_gradient = False + y = fn(x) + blocks = main_program.blocks + paddle.incubate.autograd.to_prim(blocks) + z = paddle.static.gradients([y], x) + + exe = paddle.static.Executor() + exe.run(startup_program) + res = exe.run(main_program, feed={'x': inputs}, fetch_list=[z]) + paddle.disable_static() + return res + + def compare_backward(self): + np_data = generate_data(attrs.shape) + tensor_data = paddle.to_tensor(np_data) + + expect = expect_grad(tensor_data)[0].numpy() + actual = self.cal_composite_grad(np_data)[0] + + assert expect.dtype == actual.dtype + np.testing.assert_allclose( + expect, + actual, + rtol=attrs.get_rtol("prim_backward"), + atol=attrs.get_rtol("prim_backward"), + ) + + def test_prim_backward(self): + for i in self.axes: + for j in self.dtypes: + for t in self.shapes: + attrs.set_axis(i) + attrs.set_dtype(j) + attrs.set_shape(t) + self.compare_backward() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/composite_ops/utils.py b/python/paddle/fluid/tests/unittests/composite_ops/utils.py index c43f79d1c053f6b4ce365dd751f3dd034737f836..798da50a1c4367cd30ec58b9b73be1647578e07d 100644 --- a/python/paddle/fluid/tests/unittests/composite_ops/utils.py +++ b/python/paddle/fluid/tests/unittests/composite_ops/utils.py @@ -15,11 +15,13 @@ TOLERANCE = { "float32": { - "forward": {"rtol": 1e-6, "atol": 1e-6}, - "backward": {"rtol": 1e-6, "atol": 1e-6}, - }, - "float64": { "forward": {"rtol": 1e-7, "atol": 1e-7}, "backward": {"rtol": 1e-7, "atol": 1e-7}, + "prim_backward": {"rtol": 1e-6, "atol": 1e-6}, + }, + "float64": { + "forward": {"rtol": 1e-16, "atol": 1e-16}, + "backward": {"rtol": 1e-15, "atol": 1e-15}, + "prim_backward": {"rtol": 1e-15, "atol": 1e-15}, }, } diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_backward_without_params.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_backward_without_params.py new file mode 100644 index 0000000000000000000000000000000000000000..b1924d84db5898cf68f8aea899f1b74094ff1b67 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_backward_without_params.py @@ -0,0 +1,45 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
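# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The TOLERANCE table above keys rtol/atol by dtype and by phase
# ("forward", "backward", "prim_backward"). The tests read it through
# attrs.get_rtol(...); the Attr class itself is outside this diff, so the
# helper below is a hypothetical stand-in that matches the observed call sites:
def get_rtol(dtype, phase):
    # e.g. get_rtol("float32", "prim_backward") -> 1e-6 with the values above
    return TOLERANCE[dtype][phase]["rtol"]
# ------------------------------------------------------------------------------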
+ +import unittest + +import numpy as np + +import paddle + + +class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + out = x + 1 + return out + + +class TestBackwardWithoutParams(unittest.TestCase): + def test_run(self): + net = Net() + + x = paddle.ones([2, 2]) + x.stop_gradient = False + out = net(x) + loss = paddle.mean(out) + loss.backward() + np.testing.assert_equal(x.grad.numpy(), np.full(x.shape, 0.25)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py index 499f7285f29aad5eb25435b4ad9401b05f392bbf..03f89bb84fc4293eba5d7b1d9a83f6d1e323cc4a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py @@ -36,7 +36,7 @@ class TestDy2staticException(unittest.TestCase): with self.assertRaisesRegex(Dygraph2StaticException, self.error): paddle.jit.enable_to_static(True) self.assertTrue(to_static(self.dyfunc)(self.x)) - paddle.fluid.dygraph.base._in_declarative_mode_ = False + paddle.fluid.dygraph.base.global_var._in_declarative_mode_ = False paddle.jit.enable_to_static(False) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cinn_prim.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cinn_prim.py new file mode 100644 index 0000000000000000000000000000000000000000..2811a348f46561423449eb9f646e750c7935e3cc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cinn_prim.py @@ -0,0 +1,151 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import platform +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.fluid import core + + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x): + y = self.fc(x) + out = F.softmax(y) + return out + + +class TestPrimForward(unittest.TestCase): + """ + This case only tests prim_forward + to_static + cinn. Thus we need to + set this flag as False to avoid prim_backward. 
+ core.set_prim_backward(False) + """ + + def setUp(self): + core.set_prim_backward(False) + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + + def train(self, use_prim): + paddle.seed(2022) + net = PrimeNet() + sgd = paddle.optimizer.SGD( + learning_rate=0.1, parameters=net.parameters() + ) + if use_prim: + net = apply_to_static(net, use_prim) + + res = [] + for _ in range(10): + out = net(self.x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + + res.append(out.numpy()) + + self.check_prim(net, use_prim) + + return res + + def check_prim(self, net, use_prim): + if not use_prim: + return + fwd_ops = [op.type for op in net.forward.main_program.block(0).ops] + # Ensure that softmax is split into small ops + self.assertTrue('softmax' not in fwd_ops) + + def test_cinn_prim_forward(self): + dy_res = self.train(use_prim=False) + cinn_res = self.train(use_prim=True) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + cinn_res[i], dy_res[i], rtol=1e-7, atol=1e-7 + ) + + +class TestPrimForwardAndBackward(unittest.TestCase): + """ + Test PrimeNet with @to_static + prim forward + prim backward + cinn vs. Dygraph + """ + + def setUp(self): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + + def train(self, use_prim): + core.set_prim_backward(True) + paddle.seed(2022) + net = PrimeNet() + sgd = paddle.optimizer.SGD( + learning_rate=0.1, parameters=net.parameters() + ) + if use_prim: + net = apply_to_static(net, use_prim) + + res = [] + for _ in range(10): + out = net(self.x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + + res.append(out.numpy()) + + self.check_prim(net, use_prim) + + return res + + def check_prim(self, net, use_prim): + if not use_prim: + return + fwd_ops = [op.type for op in net.forward.main_program.block(0).ops] + # Ensure that softmax is split into small ops + self.assertTrue('softmax' not in fwd_ops) + + def test_cinn_prim(self): + plat = platform.system() + if plat == "Linux": + dy_res = self.train(use_prim=False) + cinn_res = self.train(use_prim=True) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + cinn_res[i], dy_res[i], rtol=1e-6, atol=1e-6 + ) + else: + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py index daf6f0a9aca688e81834eb6e2367b7c7e127ba17..42704cfe289b0ff73db1ae43ae684feee655d6e4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py @@ -377,6 +377,7 @@ class TestTransform(TestTransformBase): if not isinstance(dy_outs, (tuple, list)): dy_outs = (dy_outs,) + self.dygraph_func.eval() st_outs = self.get_static_output() if not isinstance(st_outs, (tuple, list)): st_outs = (st_outs,) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 72ed077af3339ec3d8088677aab6f74553d2cf98..d400f15285f8006c31f013d77b8f58cf861942c7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -65,7 +65,7 @@ class TestDy2staticException(unittest.TestCase): with self.assertRaisesRegex(Dygraph2StaticException, self.error): 
paddle.jit.enable_to_static(True) self.assertTrue(paddle.jit.to_static(self.dyfunc)(self.x)) - paddle.fluid.dygraph.base._in_declarative_mode_ = False + paddle.fluid.dygraph.base.global_var._in_declarative_mode_ = False paddle.jit.enable_to_static(False) @@ -463,7 +463,7 @@ class TestDy2StIfElseRetInt4(TestDy2StIfElseRetInt1): # that the code block is under @to_static, but in this UT # an exception is thrown during Dy2St, making the `_in_declarative_mode_` # a wrong value. So We need set `_in_declarative_mode_` to False manually. - paddle.fluid.dygraph.base._in_declarative_mode_ = False + paddle.fluid.dygraph.base.global_var._in_declarative_mode_ = False paddle.jit.enable_to_static(False) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 40919edbce6d4212f5ddc4dac1710c9f480bc737..b195c7d342a724598bfa175c60442d3bca418048 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -14,6 +14,7 @@ import math import os +import platform import tempfile import time import unittest @@ -450,5 +451,67 @@ class TestResnet(unittest.TestCase): fluid.set_flags({'FLAGS_use_mkldnn': False}) +class TestResnetPrim(unittest.TestCase): + "test prim forward + prim backward + to_static" + + def setUp(self): + self.resnet_helper = ResNetHelper() + + def train(self, to_static): + paddle.jit.enable_to_static(to_static) + return self.resnet_helper.train(to_static) + + def verify_predict(self): + image = np.random.random([1, 3, 224, 224]).astype('float32') + dy_pre = self.resnet_helper.predict_dygraph(image) + st_pre = self.resnet_helper.predict_static(image) + dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image) + predictor_pre = self.resnet_helper.predict_analysis_inference(image) + np.testing.assert_allclose( + dy_pre, + st_pre, + rtol=1e-05, + err_msg='dy_pre:\n {}\n, st_pre: \n{}.'.format(dy_pre, st_pre), + ) + np.testing.assert_allclose( + dy_jit_pre, + st_pre, + rtol=1e-05, + err_msg='dy_jit_pre:\n {}\n, st_pre: \n{}.'.format( + dy_jit_pre, st_pre + ), + ) + np.testing.assert_allclose( + predictor_pre, + st_pre, + rtol=1e-05, + err_msg='predictor_pre:\n {}\n, st_pre: \n{}.'.format( + predictor_pre, st_pre + ), + ) + + def test_resnet_composite(self): + plat = platform.system() + if plat == "Linux": + print("=================== origin resnet ===================") + core.set_prim_enabled(False) + static_loss = self.train(to_static=True) + print("======= resnet with prim forward and backward =======") + core.set_prim_enabled(True) + core.set_prim_forward("debug") + dygraph_loss = self.train(to_static=True) + np.testing.assert_allclose( + static_loss, + dygraph_loss, + rtol=1e-02, + err_msg='static_loss: {} \n dygraph_loss: {}'.format( + static_loss, dygraph_loss + ), + ) + core.set_prim_enabled(False) + else: + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index d5541666e9e99763447484d059fade5e72c59149..d456a86aa9d28cdbded3da5b40be6036cf5f9777 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -232,7 +232,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_mkldnn_conv_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120) 
set_tests_properties(test_mkldnn_depthwise_conv_pass PROPERTIES TIMEOUT 120) - set_tests_properties(test_mkldnn_reshape_transpose_matmul_fuse_pass + set_tests_properties(test_onednn_reshape_transpose_matmul_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_mkldnn_mish_op PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_conv3d_op PROPERTIES TIMEOUT 300) @@ -240,7 +240,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_conv_transpose_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 250) - set_tests_properties(test_mkldnn_matmul_transpose_reshape_fuse_pass + set_tests_properties(test_onednn_matmul_transpose_reshape_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_conv_transpose_bn_fuse_pass PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py similarity index 70% rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py index a2d2260683020777431c572d5ed8104a3991afce..85cdfd314a7cf456fc938b8d602fa748490080fc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py @@ -21,7 +21,7 @@ from auto_scan_test import PassAutoScanTest from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): +class TestOneDNNMatmulTransposeReshapeFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -57,42 +57,42 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): shape_x = [batch_size, channel, 32, input_dim] shape_y = [batch_size, channel, input_dim, 16] - if type == "x": + if type == 'x': return np.random.random(shape_x).astype(np.float32) else: return np.random.random(shape_y).astype(np.float32) matmul_op = OpConfig( - type="matmul", - inputs={"X": ["input_data1"], "Y": ["input_data2"]}, - outputs={"Out": ["matmul_output"]}, + type='matmul', + inputs={'X': ['input_data1'], 'Y': ['input_data2']}, + outputs={'Out': ['matmul_output']}, attrs={ - "transpose_X": transpose_X, - "transpose_Y": transpose_Y, - "alpha": alpha, - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [], + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha, + 'fused_reshape_X': [], + 'fused_reshape_Y': [], + 'fused_transpose_X': [], + 'fused_transpose_Y': [], + 'fused_reshape_Out': [], + 'fused_transpose_Out': [], }, ) transpose2_op = OpConfig( - type="transpose2", - inputs={"X": ["matmul_output"]}, + type='transpose2', + inputs={'X': ['matmul_output']}, outputs={ - "Out": ["transpose2_output"], - "XShape": ["transpose2_xshape"], + 'Out': ['transpose2_output'], + 'XShape': ['transpose2_xshape'], }, attrs={'axis': axis}, ) reshape2_op = OpConfig( - type="reshape2", - inputs={"X": ["transpose2_output"]}, - outputs={"Out": ["reshape2_output"], "XShape": ["reshape2_xshape"]}, + type='reshape2', + inputs={'X': 
['transpose2_output']}, + outputs={'Out': ['reshape2_output'], 'XShape': ['reshape2_xshape']}, attrs={'shape': shape}, ) @@ -102,27 +102,27 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): ops=model_net, weights={}, inputs={ - "input_data1": TensorConfig( - data_gen=partial(generate_input, "x") + 'input_data1': TensorConfig( + data_gen=partial(generate_input, 'x') ), - "input_data2": TensorConfig( - data_gen=partial(generate_input, "y") + 'input_data2': TensorConfig( + data_gen=partial(generate_input, 'y') ), }, - outputs=["reshape2_output"], + outputs=['reshape2_output'], ) return program_config def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul"], (1e-5, 1e-5) + yield config, ['matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( - quant=False, passes=["matmul_transpose_reshape_mkldnn_fuse_pass"] + quant=False, passes=['matmul_transpose_reshape_mkldnn_fuse_pass'] ) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py similarity index 70% rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py index fc4d80060756c75e7b15131e8147047a138cd24d..2f9051fe16b5c34546a1eb35e9b85ab725918d8c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py @@ -20,10 +20,11 @@ import numpy as np from auto_scan_test import PassAutoScanTest from program_config import ProgramConfig, TensorConfig -num = 32 * 64 +class TestOneDNNReshapeTransposeMatmulFusePass(PassAutoScanTest): + def setUp(self): + self.num = 32 * 64 -class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -40,11 +41,11 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): input_dim = draw(st.sampled_from([32, 64])) def generate_input1(attrs): - shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], num] + shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], self.num] return np.random.random(shape_x).astype(np.float32) def generate_input2(attrs): - shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], num] + shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], self.num] input_volume = reduce(lambda x, y: x * y, shape_x) matmul_shape = [i for i in attrs[0]['shape']] if 0 in matmul_shape: @@ -66,7 +67,7 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): matmul_shape[0], matmul_shape[1], matmul_shape[-1], - int(num / matmul_shape[-1]), + int(self.num / matmul_shape[-1]), ] elif attrs[2]['transpose_X']: shape_y = matmul_shape @@ -77,17 +78,17 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): matmul_shape[0], matmul_shape[1], matmul_shape[-1], - int(num / matmul_shape[-1]), + int(self.num / matmul_shape[-1]), ] return np.random.random(shape_y).astype(np.float32) attrs = [ - {"shape": shape}, - {"axis": axis}, + {'shape': shape}, + {'axis': axis}, { - "transpose_X": transpose_X, - "transpose_Y": transpose_Y, - "alpha": alpha, + 'transpose_X': 
transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha, }, { 'batch_size': batch_size, @@ -98,37 +99,37 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): ops_config = [ { - "op_type": "reshape2", - "op_inputs": {"X": ["input_data1"]}, - "op_outputs": { - "Out": ["reshape2_output"], - "XShape": ["reshape2_xshape"], + 'op_type': 'reshape2', + 'op_inputs': {'X': ['input_data1']}, + 'op_outputs': { + 'Out': ['reshape2_output'], + 'XShape': ['reshape2_xshape'], }, - "op_attrs": {'shape': attrs[0]['shape']}, + 'op_attrs': {'shape': attrs[0]['shape']}, }, { - "op_type": "transpose2", - "op_inputs": {"X": ["reshape2_output"]}, - "op_outputs": { - "Out": ["transpose2_output"], - "XShape": ["transpose2_xshape"], + 'op_type': 'transpose2', + 'op_inputs': {'X': ['reshape2_output']}, + 'op_outputs': { + 'Out': ['transpose2_output'], + 'XShape': ['transpose2_xshape'], }, - "op_attrs": {'axis': attrs[1]['axis']}, + 'op_attrs': {'axis': attrs[1]['axis']}, }, { - "op_type": "matmul", - "op_inputs": {"X": ["transpose2_output"], "Y": ["input_data2"]}, - "op_outputs": {"Out": ["matmul_output"]}, - "op_attrs": { + 'op_type': 'matmul', + 'op_inputs': {'X': ['transpose2_output'], 'Y': ['input_data2']}, + 'op_outputs': {'Out': ['matmul_output']}, + 'op_attrs': { 'transpose_X': attrs[2]['transpose_X'], 'transpose_Y': attrs[2]['transpose_Y'], 'alpha': attrs[2]['alpha'], - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [], + 'fused_reshape_X': [], + 'fused_reshape_Y': [], + 'fused_transpose_X': [], + 'fused_transpose_Y': [], + 'fused_reshape_Out': [], + 'fused_transpose_Out': [], }, }, ] @@ -139,27 +140,27 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): ops=ops, weights={}, inputs={ - "input_data1": TensorConfig( + 'input_data1': TensorConfig( data_gen=partial(generate_input1, attrs) ), - "input_data2": TensorConfig( + 'input_data2': TensorConfig( data_gen=partial(generate_input2, attrs) ), }, - outputs=["matmul_output"], + outputs=['matmul_output'], ) return program_config def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul"], (1e-5, 1e-5) + yield config, ['matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( - quant=False, passes=["reshape_transpose_matmul_mkldnn_fuse_pass"] + quant=False, passes=['reshape_transpose_matmul_mkldnn_fuse_pass'] ) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py index 94faec4d530a63b874e53c16904699787357b067..69769bbdc1f08e36b0b82a931f15f277c667c209 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py @@ -25,7 +25,7 @@ from paddle import _C_ops, _legacy_C_ops import paddle.fluid as fluid from paddle.fluid import core, framework, executor from paddle.fluid.layers.utils import _hash_with_id -from paddle.fluid.framework import _in_eager_mode_ +from paddle.fluid.framework import global_var paddle.enable_static() np.random.seed(1243) @@ -135,7 +135,7 @@ class RunProgramNPUOpTest(unittest.TestCase): def prepare_dygraph_input(self, place, return_param_list=False): def create_var_base(is_input, name, np_value, stop_gradient): - if _in_eager_mode_: + if global_var._in_eager_mode_: var = 
core.eager.Tensor( value=np_value, name=name, place=place, zero_copy=True ) @@ -176,7 +176,7 @@ class RunProgramNPUOpTest(unittest.TestCase): for name in self.output_names['Out']: outputs['Out'].append(create_var_base(False, name)) - if _in_eager_mode_: + if global_var._in_eager_mode_: outputs['OutScope'] = [core.Scope()] else: outputs['OutScope'] = framework._varbase_creator( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/CMakeLists.txt b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/CMakeLists.txt index 7d5fc1006d1e8ef175256a2a196482c199b98c48..863a484c466f189d9ae31f4f3a0c9b7cb84373ec 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/CMakeLists.txt @@ -8,10 +8,3 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() - -set_tests_properties(test_comp_eager_tanh_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_div_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_sum_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_add_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_sub_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_eager_sqrt_grad PROPERTIES TIMEOUT 60) diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_exp_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_exp_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..e81314ba041ef7f91f0bdb4f1c266d4bcc92bb72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_exp_grad.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
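# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The test below compares paddle's composite gradient of exp against autograd's
# VJP. The identity it relies on: d/dx exp(x) = exp(x), so the VJP is simply
# cotangent * exp(primal). A standalone NumPy restatement (variable names are
# hypothetical):
import numpy as np

_primal = np.random.rand(3, 3).astype(np.float32)
_cotangent = np.random.rand(3, 3).astype(np.float32)
_expected_vjp = _cotangent * np.exp(_primal)  # what both libraries should agree on
# ------------------------------------------------------------------------------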
+ +import unittest + +import autograd +import autograd.numpy +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('primal', 'cotangent', 'dtype'), + [ + (np.random.rand(10, 10), np.random.rand(10, 10), np.float32), + ], +) +class TestExpGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + core.set_prim_enabled(True) + cls.primal = cls.primal.astype(cls.dtype) + if cls.cotangent is not None: + cls.cotangent = cls.cotangent.astype(cls.dtype) + + @classmethod + def tearDownClass(cls): + core.set_prim_enabled(False) + + def test_exp_grad_comp(self): + def actual(primal, cotangent): + primal = paddle.to_tensor(primal) + primal.stop_gradient = False + return paddle.grad( + paddle.exp(primal), primal, paddle.to_tensor(cotangent) + )[0] + + def desired(primal, cotangent): + cotangent = ( + np.ones_like(cotangent, dtype=primal.dtype) + if cotangent is None + else cotangent + ) + return autograd.make_vjp(autograd.numpy.exp)(primal)[0](cotangent) + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent), + desired=desired(self.primal, self.cotangent), + rtol=1e-6, + atol=0, + ) + + def test_stop_gradients(self): + with self.assertRaises(ValueError): + primal = paddle.to_tensor(self.primal) + primal.stop_gradient = True + return paddle.grad( + paddle.exp(primal), primal, paddle.to_tensor(self.cotangent) + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_expand_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_expand_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..c4de565dc504f73c9e505d091c4e05ec06798d57 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_expand_grad.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
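# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The expand-grad cases below ('same_shape', 'same_rank', 'diff_rank') all hinge
# on one rule: the gradient of a broadcast/expand sum-reduces the cotangent back
# to the input shape. A hedged NumPy sketch of that reduction (the helper name
# expand_vjp is hypothetical, not the composite rule's actual implementation):
import numpy as np

def expand_vjp(cotangent, primal_shape):
    grad = cotangent
    # Sum away leading axes added by rank promotion ('diff_rank' cases).
    while grad.ndim > len(primal_shape):
        grad = grad.sum(axis=0)
    # Sum over axes that were broadcast from size 1 ('same_rank' cases).
    for axis, size in enumerate(primal_shape):
        if size == 1 and grad.shape[axis] != 1:
            grad = grad.sum(axis=axis, keepdims=True)
    return grad
# ------------------------------------------------------------------------------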
+ +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('name', 'primal', 'cotangent', 'shape', 'dtype'), + ( + ( + 'same_shape', + np.random.rand(10, 10), + np.random.rand(10, 10), + (10, 10), + np.float32, + ), + ( + 'same_rank', + np.random.rand(1, 10), + np.random.rand(10, 10), + (10, 10), + np.float32, + ), + ( + 'same_rank', + np.random.rand(10, 1, 10, 1), + np.random.rand(10, 10, 10, 10), + (10, 10, 10, 10), + np.float32, + ), + ( + 'diff_rank', + np.random.rand(1, 10, 1), + np.random.rand(10, 10, 10, 10), + (10, 10, 10, 10), + np.float32, + ), + ), +) +class TestExpandGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primal = cls.primal.astype(cls.dtype) + cls.cotangent = cls.cotangent.astype(cls.dtype) + + @classmethod + def tearDownClass(cls): + core.set_prim_enabled(False) + + def test_comp(self): + def func(primal, cotangent, shape): + primal = paddle.to_tensor(primal) + primal.stop_gradient = False + cotangent = paddle.to_tensor(cotangent) + return paddle.grad(paddle.expand(primal, shape), primal, cotangent)[ + 0 + ] + + def actual(primal, cotangent, shape): + core.set_prim_enabled(True) + return func(primal, cotangent, shape) + + def desired(primal, cotangent, shape): + core.set_prim_enabled(False) + return func(primal, cotangent, shape) + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent, self.shape), + desired=desired(self.primal, self.cotangent, self.shape), + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_multiply_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_multiply_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..59daf91ab8b84b391e971ae6c28b75ea7e05b89f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/eager/test_comp_eager_multiply_grad.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
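# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The multiply-grad cases below follow from the product rule plus the same
# broadcast reduction sketched for expand above:
#   dx = reduce_to_shape(cotangent * y, x.shape)
#   dy = reduce_to_shape(cotangent * x, y.shape)
# Expressed with the hypothetical expand_vjp helper from the previous note
# (so this snippet is not self-contained on its own):
def multiply_vjp(cotangent, x, y):
    # Each partial is the cotangent scaled by the other operand, then
    # sum-reduced back to that operand's shape when broadcasting occurred.
    return (
        expand_vjp(cotangent * y, x.shape),
        expand_vjp(cotangent * x, y.shape),
    )
# ------------------------------------------------------------------------------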
+ +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('name', 'primals', 'stop_gradients', 'cotangents', 'dtype'), + ( + ( + 'test_normal_case', + (np.random.rand(2, 3, 4), np.random.rand(2, 3, 4)), + (False, False), + (np.random.rand(2, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_diff_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(3, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_same_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_stop_gradient', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, True), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_reduce_axe_empty', + (np.random.rand(2, 3, 3, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ), +) +class TestMultiplyGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primals = tuple(primal.astype(cls.dtype) for primal in cls.primals) + cls.cotangents = tuple(co.astype(cls.dtype) for co in cls.cotangents) + + def as_tuple(self, x): + return (x,) if isinstance(x, paddle.Tensor) else x + + def vjp(self): + primals, cotangents = self.primals, self.cotangents + primals = tuple(paddle.to_tensor(primal) for primal in primals) + for primal, flag in zip(primals, self.stop_gradients): + primal.stop_gradient = flag + cotangents = tuple(paddle.to_tensor(co) for co in cotangents) + out = self.as_tuple(paddle.multiply(*primals)) + grads = paddle.grad(out, primals, cotangents, allow_unused=True) + return [g for g in grads if g is not None] + + def test_comp(self): + core.set_prim_enabled(True) + actual = self.vjp() + + core.set_prim_enabled(False) + desired = self.vjp() + + for i, j in zip(actual, desired): + np.testing.assert_allclose( + i, + j, + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py index 9c392663be093e966f04783172e4698a73073a8c..b7d7969d9aa0469d98e8d460c25bc17058235648 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py @@ -21,6 +21,23 @@ import paddle from paddle.fluid import core +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x, y): + tmp = self.fc(x) + out = paddle.add(tmp, y) + return out + + @param.parameterized_class( ('primal0', 'primal1', 'dtype'), [ @@ -51,17 +68,39 @@ from paddle.fluid import core ), ], ) -class TestDivGradComp(unittest.TestCase): +class TestAddGradComp(unittest.TestCase): @classmethod def setUpClass(cls): cls.primal0 = cls.primal0.astype(cls.dtype) cls.primal1 = cls.primal1.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.y = paddle.randn([2, 4]) + self.x.stop_gradient = False + self.y.stop_gradient = False + net 
= PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x, self.y) + res = paddle.autograd.grad(out, [self.x, self.y]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_tanh_grad_comp(self): def actual(primal0, primal1): @@ -73,8 +112,7 @@ class TestDivGradComp(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False z = paddle.add(x, y) - out = paddle.tanh(z) - res = paddle.static.gradients([out], [x, y]) + res = paddle.static.gradients([z], [x, y]) exe = paddle.static.Executor() exe.run(sp) out = exe.run( @@ -100,8 +138,7 @@ class TestDivGradComp(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False z = paddle.add(x, y) - out = paddle.tanh(z) - res = paddle.static.gradients([out], [x, y]) + res = paddle.static.gradients([z], [x, y]) exe = paddle.static.Executor() exe.run(sp) out = exe.run( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py index 0325768917e43099cc8848fbe8a726e6630b0359..45cae351a73ebb98d93e526efc26e148a96ef764 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py @@ -21,6 +21,24 @@ import paddle from paddle.fluid import core +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x, y): + tmp = self.fc(x) + out = paddle.add(tmp, y) + res = paddle.tanh(out) + return res + + @param.parameterized_class( ('primal0', 'primal1', 'dtype'), [ @@ -57,13 +75,37 @@ class TestDivGradComp(unittest.TestCase): cls.primal0 = cls.primal0.astype(cls.dtype) cls.primal1 = cls.primal1.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.y = paddle.randn([2, 4]) + self.x.stop_gradient = False + self.y.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x, self.y) + res = paddle.autograd.grad(out, [self.x, self.y]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_tanh_grad_comp(self): + paddle.enable_static() + def actual(primal0, primal1): core.set_prim_enabled(True) mp, sp = paddle.static.Program(), paddle.static.Program() @@ -73,7 +115,8 @@ class TestDivGradComp(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False z = paddle.add(x, y) - res = paddle.static.gradients([z], [x, y]) + out = paddle.tanh(z) + res = 
paddle.static.gradients([out], [x, y]) exe = paddle.static.Executor() exe.run(sp) out = exe.run( @@ -99,7 +142,8 @@ class TestDivGradComp(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False z = paddle.add(x, y) - res = paddle.static.gradients([z], [x, y]) + out = paddle.tanh(z) + res = paddle.static.gradients([out], [x, y]) exe = paddle.static.Executor() exe.run(sp) out = exe.run( @@ -129,6 +173,7 @@ class TestDivGradComp(unittest.TestCase): atol=0, ) core.set_prim_enabled(False) + paddle.disable_static() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py index fde1f4549d6934d5fdc4e8a41c0bdb65e7a5641e..1d675e8bd097968ed660f52de0c1f658803837c6 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py @@ -21,6 +21,23 @@ import paddle from paddle.fluid import core +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x, y): + tmp = self.fc(x) + out = paddle.divide(tmp, y) + return out + + @param.parameterized_class( ('primal0', 'primal1', 'dtype'), [ @@ -57,11 +74,33 @@ class TestDivGradComp(unittest.TestCase): cls.primal0 = cls.primal0.astype(cls.dtype) cls.primal1 = cls.primal1.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.y = paddle.randn([2, 4]) + self.x.stop_gradient = False + self.y.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x, self.y) + res = paddle.autograd.grad(out, [self.x, self.y]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-6, + atol=1e-6, + ) + paddle.enable_static() def test_tanh_grad_comp(self): def actual(primal0, primal1): diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_exp_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_exp_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..c1c76631232c007f4a2a81cb2227035910bb57d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_exp_grad.py @@ -0,0 +1,122 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import autograd +import autograd.numpy +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('primal', 'cotangent', 'dtype'), + [ + (np.random.rand(10, 10), np.random.rand(10, 10), np.float32), + (np.random.rand(10, 10), None, np.float32), + ], +) +class TestExpGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + core.set_prim_enabled(True) + cls.primal = cls.primal.astype(cls.dtype) + if cls.cotangent is not None: + cls.cotangent = cls.cotangent.astype(cls.dtype) + + @classmethod + def tearDownClass(cls): + core.set_prim_enabled(False) + + def setUp(self): + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def test_exp_grad_comp(self): + def actual(primal, cotangent): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = False + v = ( + None + if cotangent is None + else paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + ) + y = paddle.exp(x) + x_cotangent = paddle.static.gradients(y, x, v) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=x_cotangent, + )[0] + + def desired(primal, cotangent): + cotangent = ( + np.ones_like(cotangent, dtype=primal.dtype) + if cotangent is None + else cotangent + ) + return autograd.make_vjp(autograd.numpy.exp)(primal)[0](cotangent) + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent), + desired=desired(self.primal, self.cotangent), + rtol=1e-6, + atol=0, + ) + + def test_stop_gradient(self): + def actual(primal, cotangent): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = True + v = ( + None + if cotangent is None + else paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + ) + y = paddle.exp(x) + x_cotangent = paddle.static.gradients(y, x, v) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=x_cotangent, + ) + + def desired(primal, cotangent): + return [] + + self.assertEqual( + actual(self.primal, self.cotangent), + desired(self.primal, self.cotangent), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_expand_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_expand_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..c322074d34d88715aa9aec84ca5ca6e05c88aba8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_expand_grad.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core + + +@param.parameterized_class( + ('name', 'primal', 'cotangent', 'shape', 'dtype'), + ( + ( + 'same_shape', + np.random.rand(10, 10), + np.random.rand(10, 10), + (10, 10), + np.float32, + ), + ( + 'same_rank', + np.random.rand(1, 10), + np.random.rand(10, 10), + (10, 10), + np.float32, + ), + ( + 'same_rank', + np.random.rand(10, 1, 10, 1), + np.random.rand(10, 10, 10, 10), + (10, 10, 10, 10), + np.float32, + ), + ( + 'diff_rank', + np.random.rand(1, 10, 1), + np.random.rand(10, 10, 10, 10), + (10, 10, 10, 10), + np.float32, + ), + ( + 'single_direction_broadcast', + np.random.rand(10, 10, 10, 10), + np.random.rand(1, 10, 1), + (10, 10, 10, 10), + np.float32, + ), + ), +) +class TestExpandGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primal = cls.primal.astype(cls.dtype) + cls.cotangent = cls.cotangent.astype(cls.dtype) + paddle.enable_static() + + @classmethod + def tearDownClass(cls): + paddle.disable_static() + core.set_prim_enabled(False) + + def test_comp(self): + def func(primal, cotangent, shape): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = False + v = paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + y = paddle.expand(x, shape) + x_cotangent = paddle.static.gradients(y, x) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=x_cotangent, + )[0] + + def actual(primal, cotangent, shape): + core.set_prim_enabled(True) + return func(primal, cotangent, shape) + + def desired(primal, cotangent, shape): + core.set_prim_enabled(False) + return func(primal, cotangent, shape) + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent, self.shape), + desired=desired(self.primal, self.cotangent, self.shape), + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..63e8a4f1bbf3451bed5c9402a40ffa13a0bbd319 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
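A note for readers tracing these composite-rule tests: every broadcast case in this patch (the multiply cases below, the expand cases above) ultimately reduces the output cotangent back to the shape of the corresponding primal. A minimal NumPy sketch of that reduction, standalone and illustrative only — `reduce_to` and `multiply_vjp` are hypothetical helper names, not part of this patch:

```python
import numpy as np

def reduce_to(grad, shape):
    """Reduce a broadcast gradient back to the primal's `shape`."""
    # Sum out the leading axes introduced by rank promotion.
    extra = grad.ndim - len(shape)
    grad = grad.sum(axis=tuple(range(extra)))
    # Sum (keeping dims) over axes where the primal had size 1 but was broadcast.
    axes = tuple(i for i, n in enumerate(shape) if n == 1 and grad.shape[i] != 1)
    return grad.sum(axis=axes, keepdims=True)

def multiply_vjp(x, y, v):
    # VJP of z = x * y: grad_x = v * y and grad_y = v * x,
    # each reduced back to its primal's shape.
    return reduce_to(v * y, x.shape), reduce_to(v * x, y.shape)

# Shapes taken from the 'test_broadcast_diff_rank' case.
x, y = np.random.rand(2, 3, 1, 4), np.random.rand(3, 3, 4)
v = np.random.rand(2, 3, 3, 4)
gx, gy = multiply_vjp(x, y, v)
assert gx.shape == x.shape and gy.shape == y.shape
```

The `test_reduce_axes_empty` case exercises the degenerate path where one operand's gradient needs no reduction at all and the cotangent must pass through unchanged.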
+ +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core, framework + + +@param.parameterized_class( + ('name', 'primals', 'stop_gradients', 'cotangents', 'dtype'), + ( + ( + 'test_normal_case', + (np.random.rand(2, 3, 4), np.random.rand(2, 3, 4)), + (False, False), + (np.random.rand(2, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_diff_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(3, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_same_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_stop_gradient', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, True), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_reduce_axes_empty', + (np.random.rand(2, 3, 3, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 1, 3, 1),), + np.float32, + ), + ), +) +class TestMultiplyGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primals = tuple(primal.astype(cls.dtype) for primal in cls.primals) + cls.cotangents = tuple(co.astype(cls.dtype) for co in cls.cotangents) + + def setUp(self): + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def as_tuple(self, x): + return (x,) if isinstance(x, framework.Variable) else x + + def vjp(self): + primals, cotangents = self.primals, self.cotangents + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + primals = tuple( + paddle.static.data(f'primal{i}', primal.shape, primal.dtype) + for i, primal in enumerate(primals) + ) + for primal, flag in zip(primals, self.stop_gradients): + primal.stop_gradient = flag + cotangents = tuple( + paddle.static.data(f'cotangent{i}', co.shape, co.dtype) + for i, co in enumerate(cotangents) + ) + out = self.as_tuple(paddle.multiply(*primals)) + grads = paddle.static.gradients(out, primals) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={ + **{ + f'primal{i}': primal + for i, primal in enumerate(self.primals) + }, + **{f'cotangent{i}': co for i, co in enumerate(self.cotangents)}, + }, + fetch_list=[g for g in grads if g is not None], + ) + + def test_comp(self): + + core.set_prim_enabled(True) + actual = self.vjp() + + core.set_prim_enabled(False) + desired = self.vjp() + + self.assertEqual(len(actual), len(desired)) + for i, j in zip(actual, desired): + np.testing.assert_allclose( + i, + j, + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py index 2eae9c86e25fba10f97d5e64ba3e4098abb2a671..505a4391138e95adb376924499cb95bc43fcb5cb 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py @@ -26,6 +26,23 @@ import parameterized as param import paddle + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x): + tmp = self.fc(x) +
out = paddle.sqrt(tmp) + return out + + @param.parameterized_class( ('primal', 'cotangent', 'dtype'), [ @@ -38,11 +55,31 @@ class TestSqrtGradComp(unittest.TestCase): cls.primal = cls.primal.astype(cls.dtype) cls.cotangent = cls.cotangent.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x) + res = paddle.autograd.grad(out, [self.x]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_sqrt_grad_comp(self): def actual(primal, cotangent): diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py index d7cab193a9910e2ad2999efb3743514229445400..f98a6af621f96f32e99f6d5f46afd5c297e6a528 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py @@ -21,6 +21,23 @@ import paddle from paddle.fluid import core +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x, y): + tmp = self.fc(x) + out = paddle.subtract(tmp, y) + return out + + @param.parameterized_class( ('primal0', 'primal1', 'dtype'), [ @@ -39,14 +56,15 @@ from paddle.fluid import core np.random.rand(2, 3, 1, 4), np.float32, ), + (np.random.rand(2, 3, 3, 4), np.random.rand(2, 3, 1, 4), np.float32), ( - np.random.rand(2, 3, 3, 4), + np.random.rand(2, 1, 3, 4), np.random.rand(2, 3, 1, 4), np.float32, ), ( np.random.rand(2, 3, 3, 4), - np.random.rand(2, 3, 1, 1), + np.random.rand(2, 1, 1, 4), np.float32, ), ], @@ -57,11 +75,33 @@ class TestDivGradComp(unittest.TestCase): cls.primal0 = cls.primal0.astype(cls.dtype) cls.primal1 = cls.primal1.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.y = paddle.randn([2, 4]) + self.x.stop_gradient = False + self.y.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x, self.y) + res = paddle.autograd.grad(out, [self.x, self.y]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_tanh_grad_comp(self): def actual(primal0, primal1): diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py index 
445b371b0a5a71711cc95a4f060a65fecf8dde11..c7c9109eeaab0403b87e74a0d9edea8ebe995e21 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py @@ -26,6 +26,23 @@ import parameterized as param import paddle +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super(PrimeNet, self).__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x): + tmp = self.fc(x) + out = paddle.tanh(tmp) + return out + + @param.parameterized_class( ('primal', 'cotangent', 'dtype'), [ @@ -38,11 +55,31 @@ class TestTanhGradComp(unittest.TestCase): cls.primal = cls.primal.astype(cls.dtype) cls.cotangent = cls.cotangent.astype(cls.dtype) - def setUp(self): - paddle.enable_static() + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + net = PrimeNet() + core.set_prim_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x) + res = paddle.autograd.grad(out, [self.x]) + + return res - def tearDown(self): + def test_cinn(self): paddle.disable_static() + dy_res = self.train(use_prim=False, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + + for i in range(len(dy_res)): + np.testing.assert_allclose( + comp_st_cinn_res[i].numpy(), + dy_res[i].numpy(), + rtol=1e-7, + atol=1e-7, + ) + paddle.enable_static() def test_tanh_grad_comp(self): def actual(primal, cotangent): diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 5026ae9fc96d478e78d5596a22a507c07dde18b1..48349cfe910b35f35e1e19399c39bd40a0b63726 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -96,11 +96,9 @@ class TestTanhTripleGradCheck(unittest.TestCase): gradient_checker.triple_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.tanh_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -128,11 +126,9 @@ class TestTanhDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.tanh_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -160,11 +156,9 @@ class TestAbsDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.abs_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -256,11 +250,9 @@ class TestELUDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - 
fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.elu_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -292,11 +284,9 @@ class TestCELUDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.celu_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -390,11 +380,9 @@ class TestSquareDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.square_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -424,11 +412,9 @@ class TestLogDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.log_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -456,11 +442,9 @@ class TestSinDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.sin_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() @@ -488,11 +472,9 @@ class TestCosDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.cos_wrapper, [x], y, x_init=x_arr, place=place ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index 078811b4969995f7db2fb112f10ab63a2404b881..02f649b39bfab231fe0691c39b217e017ebf3a9a 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -37,16 +37,12 @@ class TestAssignOp(op_test.OpTest): def test_forward(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.check_output(check_eager=True) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() def test_backward(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.check_grad(['X'], 'Out', check_eager=True) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() @@ -60,23 +56,18 @@ class TestAssignFP16Op(op_test.OpTest): def test_forward(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.check_output(check_eager=True) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() def 
test_backward(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.check_grad(['X'], 'Out', check_eager=True) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() class TestAssignOpWithLoDTensorArray(unittest.TestCase): def test_assign_LoDTensorArray(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) main_program = Program() startup_program = Program() with program_guard(main_program): @@ -92,7 +83,6 @@ class TestAssignOpWithLoDTensorArray(unittest.TestCase): sums = paddle.tensor.array_read(array=init_array, i=i) mean = paddle.mean(sums) append_backward(mean) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) place = ( fluid.CUDAPlace(0) @@ -207,12 +197,13 @@ class TestAssignOApi(unittest.TestCase): np.testing.assert_allclose(result3.numpy(), np.array([1]), rtol=1e-05) def test_clone(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.python_api = paddle.clone x = paddle.ones([2]) x.stop_gradient = False + x.retain_grads() clone_x = paddle.clone(x) + clone_x.retain_grads() y = clone_x**3 y.backward() @@ -220,7 +211,6 @@ class TestAssignOApi(unittest.TestCase): np.testing.assert_array_equal(x, [1, 1]) np.testing.assert_array_equal(clone_x.grad.numpy(), [3, 3]) np.testing.assert_array_equal(x.grad.numpy(), [3, 3]) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.enable_static() with program_guard(Program(), Program()): @@ -241,7 +231,6 @@ class TestAssignOApi(unittest.TestCase): class TestAssignOpErrorApi(unittest.TestCase): def test_errors(self): paddle.enable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) with program_guard(Program(), Program()): # The type of input must be Variable or numpy.ndarray. x1 = fluid.create_lod_tensor( @@ -251,7 +240,6 @@ class TestAssignOpErrorApi(unittest.TestCase): # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. 
x2 = np.array([[2.5, 2.5]], dtype='uint8') self.assertRaises(TypeError, paddle.assign, x2) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() def test_type_error(self): @@ -281,7 +269,6 @@ class TestAssignDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.assign_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -313,7 +300,6 @@ class TestAssignTripleGradCheck(unittest.TestCase): gradient_checker.triple_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.assign_wrapper, [data], out, x_init=[data_arr], place=place ) diff --git a/python/paddle/fluid/tests/unittests/test_case.py b/python/paddle/fluid/tests/unittests/test_case.py index 9123b4b009d1866d7ce9385777e4ddc5ffc0d5e0..675b51cf0a0535e0cc34f900477e3e2aaf8ee76a 100644 --- a/python/paddle/fluid/tests/unittests/test_case.py +++ b/python/paddle/fluid/tests/unittests/test_case.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers import paddle.fluid.optimizer as optimizer +from paddle.fluid.backward import append_backward from paddle.fluid.framework import Program, program_guard paddle.enable_static() @@ -145,10 +146,101 @@ class TestAPICase(unittest.TestCase): ) np.testing.assert_allclose(res[0], 1, rtol=1e-05) + self.assertEqual(res[0].shape, ()) np.testing.assert_allclose(res[1], 2, rtol=1e-05) + self.assertEqual(res[1].shape, ()) np.testing.assert_allclose(res[2], 3, rtol=1e-05) + self.assertEqual(res[2].shape, ()) np.testing.assert_allclose(res[3], 2, rtol=1e-05) + self.assertEqual(res[3].shape, ()) np.testing.assert_allclose(res[4], 2, rtol=1e-05) + self.assertEqual(res[4].shape, ()) + + def test_0d_tensor_backward(self): + main_program = Program() + startup_program = Program() + with program_guard(main_program, startup_program): + x = paddle.full(shape=[], dtype='float32', fill_value=-2.0) + x.stop_gradient = False + pred = paddle.full(shape=[], dtype='bool', fill_value=0) + # pred is False, so out = -x + out = paddle.static.nn.case( + pred_fn_pairs=[(pred, lambda: x)], default=lambda: -x + ) + append_backward(out) + + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) + exe = fluid.Executor(place) + + res = exe.run(main_program, fetch_list=[out.name, x.grad_name]) + np.testing.assert_allclose( + np.asarray(res[0]), np.array(2.0), rtol=1e-05 + ) + self.assertEqual(res[0].shape, ()) + np.testing.assert_allclose( + np.asarray(res[1]), np.array(-1.0), rtol=1e-05 + ) + self.assertEqual(res[1].shape, ()) + + def test_0d_tensor_dygraph(self): + paddle.disable_static() + + def fn_1(): + return paddle.full(shape=[], dtype='int32', fill_value=1) + + def fn_2(): + return paddle.full(shape=[], dtype='int32', fill_value=2) + + def fn_3(): + return paddle.full(shape=[], dtype='int32', fill_value=3) + + x = paddle.full(shape=[], dtype='float32', fill_value=0.3) + y = paddle.full(shape=[], dtype='float32', fill_value=0.1) + z = paddle.full(shape=[], dtype='float32', fill_value=0.2) + pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1 + pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3 + + # call fn_1 + out_0 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_1, fn_1), (pred_1, 
fn_2)], default=fn_3 + ) + + # call fn_2 + out_1 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)], default=fn_3 + ) + + # call default fn_3 + out_2 = paddle.static.nn.control_flow.case( + pred_fn_pairs=((pred_2, fn_1), (pred_2, fn_2)), default=fn_3 + ) + + # no default, call fn_2 + out_3 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_1, fn_2)] + ) + + # no default; pred_2 is False, so the last fn (fn_2) serves as the fallback + out_4 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_2, fn_2)] + ) + + np.testing.assert_allclose(out_0, 1, rtol=1e-05) + self.assertEqual(out_0.shape, []) + np.testing.assert_allclose(out_1, 2, rtol=1e-05) + self.assertEqual(out_1.shape, []) + np.testing.assert_allclose(out_2, 3, rtol=1e-05) + self.assertEqual(out_2.shape, []) + np.testing.assert_allclose(out_3, 2, rtol=1e-05) + self.assertEqual(out_3.shape, []) + np.testing.assert_allclose(out_4, 2, rtol=1e-05) + self.assertEqual(out_4.shape, []) + + paddle.enable_static() def test_return_var_tuple(self): def fn_1(): @@ -394,8 +486,11 @@ class TestAPICase_Nested(unittest.TestCase): res = exe.run(main_program, fetch_list=[out_1, out_2, out_3]) np.testing.assert_allclose(res[0], 1, rtol=1e-05) + self.assertEqual(res[0].shape, ()) np.testing.assert_allclose(res[1], 2, rtol=1e-05) + self.assertEqual(res[1].shape, ()) np.testing.assert_allclose(res[2], 3, rtol=1e-05) + self.assertEqual(res[2].shape, ()) class TestAPICase_Error(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index 9769aa8df430e70e46ae84fc3a12ea7221c1abe7..1b1c08e51e2cd16c7d85a052f21101ff1f5efd99 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -103,6 +103,7 @@ class TestCondInputOutput(unittest.TestCase): exe = fluid.Executor(place) (ret,) = exe.run(main_program, fetch_list=[out.name]) np.testing.assert_allclose(np.asarray(ret), np.array(2), rtol=1e-05) + self.assertEqual(ret.shape, ()) def test_0d_tensor_as_cond(self): """ @@ -129,7 +130,7 @@ class TestCondInputOutput(unittest.TestCase): y = paddle.full(shape=[], dtype='float32', fill_value=0.23) pred = paddle.greater_equal(y, x) out = paddle.static.nn.cond(pred, true_func, false_func) - # out is one tensor + # out is a tensor place = ( fluid.CUDAPlace(0) if core.is_compiled_with_cuda() else fluid.CPUPlace() ) @@ -168,14 +169,41 @@ class TestCondInputOutput(unittest.TestCase): if core.is_compiled_with_cuda() else fluid.CPUPlace() ) + exe = fluid.Executor(place) ret = exe.run(main_program, fetch_list=[out.name, a.grad_name]) np.testing.assert_allclose( np.asarray(ret[0]), np.array(2.0), rtol=1e-05 ) + self.assertEqual(ret[0].shape, ()) np.testing.assert_allclose( np.asarray(ret[1]), np.array(-1.0), rtol=1e-05 ) + self.assertEqual(ret[1].shape, ()) + + def test_0d_tensor_dygraph(self): + """ + pseudocode: + + a = -2.0 + if a >= 0: + return a + else: + return -a + """ + paddle.disable_static() + a = paddle.full(shape=[], dtype='float32', fill_value=-2.0) + a.stop_gradient = False + out = paddle.static.nn.cond(a >= 0, lambda: a, lambda: -a) + out.backward() + + np.testing.assert_allclose(np.asarray(out), np.array(2.0), rtol=1e-05) + self.assertEqual(out.shape, []) + + np.testing.assert_allclose( + np.asarray(a.grad), np.array(-1.0), rtol=1e-05 + ) + self.assertEqual(a.grad.shape, []) def test_return_var_tuple(self): """ @@ -527,9 +555,11 @@ class TestCondNestedControlFlow(unittest.TestCase): np.testing.assert_allclose( np.asarray(ret[0]), np.array(7.0), rtol=1e-05 ) +
self.assertEqual(ret[0].shape, ()) np.testing.assert_allclose( np.asarray(ret[1]), np.array(2.0), rtol=1e-05 ) + self.assertEqual(ret[1].shape, ()) def test_cond_op_in_condition(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index d8ba91bad7b8a530fe3137b00b670f5e0878cf04..edfa7665882ca6e8180a5302114b2b5972585f9b 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -18,18 +18,16 @@ import shutil import unittest import numpy as np -from simple_nets import simple_fc_net_with_inputs import paddle from paddle.device.cuda.graphs import CUDAGraph -from paddle.fluid.dygraph.base import switch_to_static_graph def can_use_cuda_graph(): return paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm() -class TestCUDAGraph(unittest.TestCase): +class TestCUDAGraphInDygraphMode(unittest.TestCase): def setUp(self): if can_use_cuda_graph(): paddle.set_flags( @@ -46,94 +44,6 @@ class TestCUDAGraph(unittest.TestCase): np.random.randint(low=0, high=10, size=shape).astype("float32") ) - @switch_to_static_graph - def test_cuda_graph_static_graph(self): - if not can_use_cuda_graph(): - return - - seed = 100 - loss_cuda_graph = self.cuda_graph_static_graph_main( - seed, use_cuda_graph=True - ) - loss_no_cuda_graph = self.cuda_graph_static_graph_main( - seed, use_cuda_graph=False - ) - self.assertEqual(loss_cuda_graph, loss_no_cuda_graph) - - def cuda_graph_static_graph_main(self, seed, use_cuda_graph): - batch_size = 1 - class_num = 10 - image_shape = [batch_size, 784] - label_shape = [batch_size, 1] - - paddle.seed(seed) - np.random.seed(seed) - startup = paddle.static.Program() - main = paddle.static.Program() - with paddle.static.program_guard(main, startup): - image = paddle.static.data( - name="image", shape=image_shape, dtype='float32' - ) - label = paddle.static.data( - name="label", shape=label_shape, dtype='int64' - ) - image.persistable = True - label.persistable = True - loss = simple_fc_net_with_inputs(image, label, class_num) - loss.persistable = True - lr = paddle.optimizer.lr.PiecewiseDecay( - boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04] - ) - optimizer = paddle.optimizer.SGD(learning_rate=lr) - optimizer.minimize(loss) - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - scope = paddle.static.Scope() - with paddle.static.scope_guard(scope): - exe.run(startup) - build_strategy = paddle.static.BuildStrategy() - build_strategy.allow_cuda_graph_capture = True - build_strategy.fix_op_run_order = True - build_strategy.fuse_all_optimizer_ops = True - compiled_program = paddle.static.CompiledProgram( - main - ).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy, places=place - ) - image_t = scope.var(image.name).get_tensor() - label_t = scope.var(label.name).get_tensor() - loss_t = scope.var(loss.name).get_tensor() - lr_var = main.global_block().var(lr._var_name) - self.assertTrue(lr_var.persistable) - lr_t = scope.var(lr_var.name).get_tensor() - cuda_graph = None - for batch_id in range(20): - image_t.set( - np.random.rand(*image_shape).astype('float32'), place - ) - label_t.set( - np.random.randint( - low=0, high=class_num, size=label_shape, dtype='int64' - ), - place, - ) - - if batch_id == 1 and use_cuda_graph: - cuda_graph = CUDAGraph(place, mode="global") - cuda_graph.capture_begin() - exe.run(compiled_program) - cuda_graph.capture_end() - - if cuda_graph: 
- lr_t.set(np.array([lr()], dtype='float32'), place) - cuda_graph.replay() - else: - exe.run(compiled_program) - lr.step() - if cuda_graph: - cuda_graph.reset() - return np.array(loss_t) - def test_cuda_graph_dynamic_graph(self): if not can_use_cuda_graph(): return diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_static_mode.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_static_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..e159334c87a6492e50d51d057c6c5ab8513a9e96 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_static_mode.py @@ -0,0 +1,139 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from simple_nets import simple_fc_net_with_inputs + +import paddle +from paddle.device.cuda.graphs import CUDAGraph +from paddle.fluid.dygraph.base import switch_to_static_graph + + +def can_use_cuda_graph(): + return paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm() + + +class TestCUDAGraphInStaticMode(unittest.TestCase): + def setUp(self): + if can_use_cuda_graph(): + # The behavior of `FLAGS_use_stream_safe_cuda_allocator` in static + # mode is inconsistent with that in dygraph mode. + # In static mode, FLAGS_use_stream_safe_cuda_allocator must be True. + # In dygraph mode, FLAGS_use_stream_safe_cuda_allocator must be False. + # These two types of unittests need to be written separately, because + # the allocator may only be initialized once, and the flag + # `FLAGS_use_stream_safe_cuda_allocator` only takes effect during + # initialization. 
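The comment above captures the key constraint, and it generalizes: allocator-related flags are read once, when the allocator is first constructed. A standalone sketch of the ordering this implies, assuming a CUDA build (illustrative only, not part of the patch):

```python
import paddle

# Set allocator flags before anything allocates device memory; once the
# first allocation constructs the allocator, later flag changes are ignored.
paddle.set_flags({'FLAGS_use_stream_safe_cuda_allocator': True})
print(paddle.get_flags(['FLAGS_use_stream_safe_cuda_allocator']))
# {'FLAGS_use_stream_safe_cuda_allocator': True}

x = paddle.ones([2, 2])  # the first allocation pins the allocator behavior
```

This is also why the dygraph and static CUDA-graph tests cannot share a process: each side needs the flag in a different state at initialization time.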
+ paddle.set_flags( + { + 'FLAGS_allocator_strategy': 'auto_growth', + 'FLAGS_sync_nccl_allreduce': False, + 'FLAGS_cudnn_deterministic': True, + 'FLAGS_use_stream_safe_cuda_allocator': True, + } + ) + + @switch_to_static_graph + def test_cuda_graph_static_graph(self): + if not can_use_cuda_graph(): + return + + seed = 100 + loss_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=True + ) + loss_no_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=False + ) + self.assertEqual(loss_cuda_graph, loss_no_cuda_graph) + + def cuda_graph_static_graph_main(self, seed, use_cuda_graph): + batch_size = 1 + class_num = 10 + image_shape = [batch_size, 784] + label_shape = [batch_size, 1] + + paddle.seed(seed) + np.random.seed(seed) + startup = paddle.static.Program() + main = paddle.static.Program() + with paddle.static.program_guard(main, startup): + image = paddle.static.data( + name="image", shape=image_shape, dtype='float32' + ) + label = paddle.static.data( + name="label", shape=label_shape, dtype='int64' + ) + image.persistable = True + label.persistable = True + loss = simple_fc_net_with_inputs(image, label, class_num) + loss.persistable = True + lr = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04] + ) + optimizer = paddle.optimizer.SGD(learning_rate=lr) + optimizer.minimize(loss) + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + exe.run(startup) + build_strategy = paddle.static.BuildStrategy() + build_strategy.allow_cuda_graph_capture = True + build_strategy.fix_op_run_order = True + build_strategy.fuse_all_optimizer_ops = True + compiled_program = paddle.static.CompiledProgram( + main + ).with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy, places=place + ) + image_t = scope.var(image.name).get_tensor() + label_t = scope.var(label.name).get_tensor() + loss_t = scope.var(loss.name).get_tensor() + lr_var = main.global_block().var(lr._var_name) + self.assertTrue(lr_var.persistable) + lr_t = scope.var(lr_var.name).get_tensor() + cuda_graph = None + for batch_id in range(20): + image_t.set( + np.random.rand(*image_shape).astype('float32'), place + ) + label_t.set( + np.random.randint( + low=0, high=class_num, size=label_shape, dtype='int64' + ), + place, + ) + + if batch_id == 1 and use_cuda_graph: + cuda_graph = CUDAGraph(place, mode="global") + cuda_graph.capture_begin() + exe.run(compiled_program) + cuda_graph.capture_end() + + if cuda_graph: + lr_t.set(np.array([lr()], dtype='float32'), place) + cuda_graph.replay() + else: + exe.run(compiled_program) + lr.step() + if cuda_graph: + cuda_graph.reset() + return np.array(loss_t) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index 4b0cae035b06dd2c5552f84bfe136a311c1a3055..3e21537cdecf01840980172dd1b502b168fb12b9 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -16,15 +16,12 @@ import os import tempfile import unittest -import gradient_checker import numpy as np -from decorator_helper import prog_scope from op_test import OpTest import paddle import paddle.fluid as fluid import paddle.fluid.core as core -import paddle.fluid.layers as layers import paddle.inference as paddle_infer @@ -230,7 +227,7 @@ class TestSumOpExclusive1(OpTest): def setUp(self): 
self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((4, 5, 65)).astype("float64") + a = np.random.random((4, 5, 20)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -245,12 +242,15 @@ class TestSumOpExclusive1(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusive2(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((1, 1, 888)).astype("float64") + a = np.random.random((1, 1, 100)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -265,12 +265,15 @@ class TestSumOpExclusive2(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusive3(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((4, 5, 888)).astype("float32") + a = np.random.random((4, 5, 20)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -285,12 +288,15 @@ class TestSumOpExclusive3(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusive4(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((1, 1, 3049)).astype("float64") + a = np.random.random((1, 1, 100)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -305,12 +311,15 @@ class TestSumOpExclusive4(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusive5(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True} - a = np.random.random((4, 5, 3096)).astype("float64") + a = np.random.random((4, 5, 40)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -325,12 +334,15 @@ class TestSumOpExclusive5(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpExclusiveFP16(OpTest): def setUp(self): self.op_type = "cumsum" self.attrs = {'axis': 2, "exclusive": True, "dtype": "float16"} - a = np.random.random((4, 5, 3096)).astype("float64") + a = np.random.random((4, 5, 20)).astype("float64") self.inputs = {'X': a} self.outputs = { 'Out': np.concatenate( @@ -345,6 +357,9 @@ class TestSumOpExclusiveFP16(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class TestSumOpReverseExclusive(OpTest): def setUp(self): @@ -366,6 +381,9 @@ class TestSumOpReverseExclusive(OpTest): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X'], 'Out') + class BadInputTest(unittest.TestCase): def test_error(self): @@ -407,7 +425,6 @@ class TestTensorAxis(unittest.TestCase): with paddle.static.program_guard(main_prog, starup_prog): # run static x = paddle.static.data(shape=np_x.shape, name='x', dtype=np_x.dtype) - print(x) linear = paddle.nn.Linear(np_x.shape[-1], np_x.shape[-1]) linear_out = linear(x) relu_out = paddle.nn.functional.relu(linear_out) @@ -444,67 +461,5 @@ class TestTensorAxis(unittest.TestCase): np.testing.assert_allclose(static_out[0], infer_out) -class TestCumsumDoubleGradCheck(unittest.TestCase): - def 
cumsum_wrapper(self, x): - return paddle.cumsum(x[0], 0) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not inlcude -1. - eps = 0.005 - dtype = np.float64 - - data = layers.data('data', [3, 4], False, dtype) - data.persistable = True - out = paddle.cumsum(data, 0) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.double_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.double_grad_check_for_dygraph( - self.cumsum_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - self.func(p) - - -class TestCumsumTripleGradCheck(unittest.TestCase): - def cumsum_wrapper(self, x): - return paddle.cumsum(x[0], 0) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not inlcude -1. - eps = 0.005 - dtype = np.float32 - - data = layers.data('data', [2, 3], False, dtype) - data.persistable = True - out = paddle.cumsum(data, 0) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.triple_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.triple_grad_check_for_dygraph( - self.cumsum_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - self.func(p) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index f1a790d8e89c11e9b7ff0400196e5a9a160f7e0e..5d0b06d02f80cf5513bd6a3f47bf2b9148450145 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -436,7 +436,7 @@ class TestFleetBase(unittest.TestCase): ) if tr0_ret != 0 or tr1_ret != 0: - if is_listen_failed(ps0_err) or is_listen_failed(ps1_err): + if is_listen_failed(ps0_err_log) or is_listen_failed(ps1_err_log): print("find parameter server port bind failed, skip the error") tr0_ret, tr1_ret = 0, 0 else: diff --git a/python/paddle/fluid/tests/unittests/test_histogram_op.py b/python/paddle/fluid/tests/unittests/test_histogram_op.py index 71283aceaa4030cc79751677dba9fd98dc680d3a..dc52df4226a29733def185d9afa080a115191aac 100644 --- a/python/paddle/fluid/tests/unittests/test_histogram_op.py +++ b/python/paddle/fluid/tests/unittests/test_histogram_op.py @@ -153,6 +153,14 @@ class TestHistogramOp(OpTest): self.check_output(check_eager=True) +class TestHistogramOp_ZeroDim(TestHistogramOp): + def init_test_case(self): + self.in_shape = [] + self.bins = 5 + self.min = 1 + self.max = 5 + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py index 3f1283864dfbe047fabe9fb5cfd08b23bdb64f7a..63322b3f6d868f1f89557be4e4b38bc4b838df69 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py @@ -166,6 +166,7 @@ class TestDygraphTripleGrad(TestCase): @dygraph_guard def func_example_with_gradient_and_create_graph(self): x = 
random_var(self.shape) + x.retain_grads() x_np = x.numpy() x.stop_gradient = False @@ -222,10 +223,8 @@ class TestDygraphTripleGrad(TestCase): np.testing.assert_allclose(dddx_grad_actual, dddx_expected, rtol=1e-05) def test_all_cases(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.func_exception() self.func_example_with_gradient_and_create_graph() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) class TestDygraphTripleGradBradcastCase(TestCase): @@ -259,6 +258,7 @@ class TestDygraphTripleGradBradcastCase(TestCase): @dygraph_guard def func_example_with_gradient_and_create_graph(self): x = random_var(self.x_shape) + x.retain_grads() x_np = x.numpy() x.stop_gradient = False @@ -316,9 +316,7 @@ class TestDygraphTripleGradBradcastCase(TestCase): np.testing.assert_allclose(dddx_grad_actual, dddx_expected, rtol=1e-05) def test_all_cases(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) self.func_example_with_gradient_and_create_graph() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) # d_ddout is none, dtype is float32 diff --git a/python/paddle/fluid/tests/unittests/test_op_function_generator.py b/python/paddle/fluid/tests/unittests/test_op_function_generator.py index eff73a4548f0eec2c67661a3332611ea25e9ab58..54356f4f8e999d357f10eab994b50da54fe5283b 100644 --- a/python/paddle/fluid/tests/unittests/test_op_function_generator.py +++ b/python/paddle/fluid/tests/unittests/test_op_function_generator.py @@ -72,7 +72,6 @@ class TestVariable(unittest.TestCase): np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_trace_backward(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) with fluid.dygraph.guard(): a = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) b = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) @@ -80,8 +79,11 @@ class TestVariable(unittest.TestCase): y = fluid.dygraph.to_variable(b) x.stop_gradient = False y.stop_gradient = False + x.retain_grads() + y.retain_grads() loss = _legacy_C_ops.elementwise_mul(x, y) + loss.retain_grads() loss.backward() x_grad = x.gradient() @@ -89,7 +91,6 @@ class TestVariable(unittest.TestCase): np.testing.assert_array_equal(x_grad, loss.gradient() * b) np.testing.assert_array_equal(y_grad, loss.gradient() * a) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 887ce9ff3f7411bb01115e8d648b53be0ec7de31..0fb23bf73d58e11e65c99c57bf9dec05a9b5838c 100755 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -505,17 +505,18 @@ class TestReshapeZeroTensor(unittest.TestCase): class TestReshapeAPI_ZeroDim(unittest.TestCase): def test_dygraph(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) x = paddle.rand([]) x.stop_gradient = False out = paddle.reshape(x, [1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1]) self.assertEqual(out.grad.shape, [1]) out = paddle.reshape(x, [-1, 1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1, 1]) @@ -524,6 +525,7 @@ class TestReshapeAPI_ZeroDim(unittest.TestCase): x = paddle.rand([1]) x.stop_gradient = False out = paddle.reshape(x, []) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1]) 
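The `retain_grads()` calls threaded through these tests replace the old process-wide `FLAGS_retain_grad_for_all_tensor` switch with per-tensor opt-in: non-leaf tensors drop their `.grad` after `backward()` unless explicitly asked to keep it. A standalone dygraph sketch of the pattern (illustrative only, not part of the patch):

```python
import paddle

x = paddle.rand([2, 3])
x.stop_gradient = False   # leaf tensor that requires grad
y = x * 2                 # intermediate (non-leaf) tensor
y.retain_grads()          # opt in to keeping y.grad after backward()
z = y.sum()
z.backward()

assert x.grad is not None  # leaf gradients are always retained
assert y.grad is not None  # retained only because of retain_grads()
```

Without the `retain_grads()` call, `y.grad` would be `None` after `backward()`, which is why these tests attach it to every intermediate whose gradient they assert on.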
self.assertEqual(out.shape, []) diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index 7538fffb808ef17caf10096cb1dde41ed2fd1ef8..35686f843dec6f84ba4e698318404dfb40a99e94 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -26,7 +26,7 @@ from paddle.fluid.executor import ( _is_dy2st_enable_standalone_executor, _is_enable_standalone_executor, ) -from paddle.fluid.framework import _in_eager_mode_ +from paddle.fluid.framework import global_var from paddle.fluid.layers.utils import _hash_with_id paddle.enable_static() @@ -177,7 +177,7 @@ class RunProgramOpTest(unittest.TestCase): def prepare_dygraph_input(self, place, return_param_list=False): def create_var_base(is_input, name, np_value, stop_gradient): - if _in_eager_mode_: + if global_var._in_eager_mode_: var = core.eager.Tensor( value=np_value, name=name, place=place, zero_copy=True ) @@ -218,7 +218,7 @@ class RunProgramOpTest(unittest.TestCase): for name in self.output_names['Out']: outputs['Out'].append(create_var_base(False, name)) - if _in_eager_mode_: + if global_var._in_eager_mode_: outputs['OutScope'] = [core.Scope()] else: outputs['OutScope'] = framework._varbase_creator( diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index af76b09047bce619ae6bfadd7dcdccf5ee807f6a..12838b218b43eadb6c2d8e45cfde4c7094a33a2d 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -895,7 +895,6 @@ class TestSliceDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.slice_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -931,7 +930,6 @@ class TestSliceTripleGradCheck(unittest.TestCase): gradient_checker.triple_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.slice_wrapper, [data], out, x_init=[data_arr], place=place ) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py index e2a98b170e91ca013a1756b82460ec05bd53f826..96ea87dd1b9e19559655e5ef0970f5b2fe38153a 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py @@ -43,7 +43,6 @@ class TestSparseElementWiseAPI(unittest.TestCase): """ def setUp(self): - paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) np.random.seed(2022) self.op_list = op_list self.csr_shape = [128, 256] @@ -109,7 +108,9 @@ class TestSparseElementWiseAPI(unittest.TestCase): y, dtype=dtype, stop_gradient=False ) coo_x = s_dense_x.to_sparse_coo(sparse_dim) + coo_x.retain_grads() coo_y = s_dense_y.to_sparse_coo(sparse_dim) + coo_y.retain_grads() actual_res = get_actual_res(coo_x, coo_y, op) actual_res.backward(actual_res) @@ -157,9 +158,12 @@ class TestSparseElementWiseAPI(unittest.TestCase): sp_a = sparse.sparse_coo_tensor( indices_data, values1_data, shape, stop_gradient=False ) + sp_a.retain_grads() + sp_b = sparse.sparse_coo_tensor( indices_data, values2_data, shape, stop_gradient=False ) + 
sp_b.retain_grads() values1 = paddle.to_tensor(values1_data, stop_gradient=False) values2 = paddle.to_tensor(values2_data, stop_gradient=False) @@ -185,6 +189,7 @@ class TestSparseElementWiseAPI(unittest.TestCase): sp_a = sparse.sparse_coo_tensor( indices_data, values_data, shape, stop_gradient=False ) + sp_a.retain_grads() bias_values = [1.0, 2.0] diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 5cd625770ba7ec0061864ab82a72b7e2f88379d0..5d6d83d6586e74037cc327d4694fed2029f9f0b8 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -17,7 +17,6 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid import paddle.fluid.core as core devices = ['cpu', 'gpu'] @@ -148,7 +147,6 @@ class TestSparseConvert(unittest.TestCase): assert np.array_equal(dense_x.grad.numpy(), out_grad.to_dense().numpy()) def test_coo_to_dense(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] indices_dtypes = ['int32', 'int64'] @@ -159,6 +157,7 @@ class TestSparseConvert(unittest.TestCase): shape=[3, 4], stop_gradient=False, ) + sparse_x.retain_grads() dense_tensor = sparse_x.to_dense() # test to_dense_grad backward out_grad = [ @@ -180,12 +179,12 @@ class TestSparseConvert(unittest.TestCase): shape=[3, 4], stop_gradient=False, ) + sparse_x_cpu.retain_grads() dense_tensor_cpu = sparse_x_cpu.to_dense() dense_tensor_cpu.backward(paddle.to_tensor(out_grad)) assert np.array_equal( correct_x_grad, sparse_x_cpu.grad.values().numpy() ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_to_sparse_csr(self): x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] @@ -202,7 +201,6 @@ class TestSparseConvert(unittest.TestCase): assert np.array_equal(dense_tensor.numpy(), x) def test_coo_values_grad(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] sparse_x = paddle.sparse.sparse_coo_tensor( @@ -211,6 +209,7 @@ class TestSparseConvert(unittest.TestCase): shape=[3, 4], stop_gradient=False, ) + sparse_x.retain_grads() values_tensor = sparse_x.values() out_grad = [2.0, 3.0, 5.0, 8.0, 9.0] # test coo_values_grad @@ -230,6 +229,7 @@ class TestSparseConvert(unittest.TestCase): shape=[3, 4, 2], stop_gradient=False, ) + sparse_x.retain_grads() values_tensor = sparse_x.values() out_grad = [ [2.0, 2.0], @@ -241,7 +241,6 @@ class TestSparseConvert(unittest.TestCase): # test coo_values_grad values_tensor.backward(paddle.to_tensor(out_grad)) assert np.array_equal(out_grad, sparse_x.grad.values().numpy()) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_sparse_coo_tensor_grad(self): for device in devices: diff --git a/python/paddle/fluid/tests/unittests/test_switch_case.py b/python/paddle/fluid/tests/unittests/test_switch_case.py index 2ddbd0f7ff051e246d3e6e3bb54cc2bddab25e4a..3fad3bdfd0c0db108fe20764d28d59766a91755c 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_case.py +++ b/python/paddle/fluid/tests/unittests/test_switch_case.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers +from paddle.fluid.backward import append_backward from paddle.fluid.framework import Program, program_guard paddle.enable_static() @@ -93,25 
+94,25 @@ class TestAPISwitchCase(unittest.TestCase): res[1], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[1], 2), ) np.testing.assert_allclose( res[2], 3, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 3), + err_msg='result is {} but answer is {}'.format(res[2], 3), ) np.testing.assert_allclose( res[3], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[3], 2), ) np.testing.assert_allclose( res[4], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[4], 2), ) def test_0d_tensor(self): @@ -176,30 +177,148 @@ class TestAPISwitchCase(unittest.TestCase): rtol=1e-05, err_msg='result is {} but answer is {}'.format(res[0], 1), ) + self.assertEqual(res[0].shape, ()) np.testing.assert_allclose( res[1], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[1], 2), ) + self.assertEqual(res[1].shape, ()) np.testing.assert_allclose( res[2], 3, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 3), + err_msg='result is {} but answer is {}'.format(res[2], 3), ) + self.assertEqual(res[2].shape, ()) np.testing.assert_allclose( res[3], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[3], 2), ) + self.assertEqual(res[3].shape, ()) np.testing.assert_allclose( res[4], 2, rtol=1e-05, - err_msg='result is {} but answer is {}'.format(res[0], 2), + err_msg='result is {} but answer is {}'.format(res[4], 2), ) + self.assertEqual(res[4].shape, ()) + + def test_0d_tensor_backward(self): + main_program = Program() + startup_program = Program() + with program_guard(main_program, startup_program): + x = paddle.full(shape=[], dtype='float32', fill_value=-2.0) + x.stop_gradient = False + pred = paddle.full(shape=[], dtype='int32', fill_value=2) + # pred is 2, so out = 2 * x + out = paddle.static.nn.switch_case( + branch_index=pred, + branch_fns=[(1, lambda: x), (2, lambda: 2 * x)], + default=lambda: -x, + ) + append_backward(out) + + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) + exe = fluid.Executor(place) + + res = exe.run(main_program, fetch_list=[out.name, x.grad_name]) + np.testing.assert_allclose( + np.asarray(res[0]), np.array(-4.0), rtol=1e-05 + ) + self.assertEqual(res[0].shape, ()) + np.testing.assert_allclose( + np.asarray(res[1]), np.array(2.0), rtol=1e-05 + ) + self.assertEqual(res[1].shape, ()) + + def test_0d_tensor_dygraph(self): + paddle.disable_static() + + def fn_1(): + return paddle.full(shape=[], dtype='int32', fill_value=1) + + def fn_2(): + return paddle.full(shape=[], dtype='int32', fill_value=2) + + def fn_3(): + return paddle.full(shape=[], dtype='int32', fill_value=3) + + index_1 = paddle.full(shape=[], dtype='int32', fill_value=1) + index_2 = paddle.full(shape=[], dtype='int32', fill_value=2) + index_5 = paddle.full(shape=[], dtype='int32', fill_value=5) + + # call fn_1 + out_0 = paddle.static.nn.switch_case( + branch_index=index_1, branch_fns={1: fn_1, 2: fn_2, 3: fn_3} + ) + + # call fn_2 : branch_fns={0: fn_1, 1:fn_2, 2:fn_3} + out_1 = paddle.static.nn.switch_case( + branch_index=index_1, branch_fns=(fn_1, fn_2, fn_3) + ) + + # call default fn_3 + out_2 = paddle.static.nn.switch_case( + branch_index=index_5, + 
branch_fns=((1, fn_1), (2, fn_2)), + default=fn_3, + ) + + # no default, call fn_2 + out_3 = paddle.static.nn.switch_case( + branch_index=index_2, branch_fns=[(1, fn_1), (2, fn_2)] + ) + + # no default, call fn_2 but branch_index is 5 + out_4 = paddle.static.nn.switch_case( + branch_index=index_5, + branch_fns=[(1, fn_1), (3, fn_2), (2, fn_3)], + ) + np.testing.assert_allclose( + out_0, + 1, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_0, 1), + ) + self.assertEqual(out_0.shape, []) + np.testing.assert_allclose( + out_1, + 2, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_1, 2), + ) + self.assertEqual(out_1.shape, []) + np.testing.assert_allclose( + out_2, + 3, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_2, 3), + ) + self.assertEqual(out_2.shape, []) + np.testing.assert_allclose( + out_3, + 2, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_3, 2), + ) + self.assertEqual(out_3.shape, []) + np.testing.assert_allclose( + out_4, + 2, + rtol=1e-05, + err_msg='result is {} but answer is {}'.format(out_4, 2), + ) + self.assertEqual(out_4.shape, []) + + paddle.enable_static() def test_return_var_tuple(self): def fn_1(): @@ -426,18 +545,21 @@ class TestAPISwitchCase_Nested(unittest.TestCase): rtol=1e-05, err_msg='result is {} but answer is {}'.format(res[0], 1), ) + self.assertEqual(res[0].shape, ()) np.testing.assert_allclose( res[1], 2, rtol=1e-05, err_msg='result is {} but answer is {}'.format(res[1], 2), ) + self.assertEqual(res[1].shape, ()) np.testing.assert_allclose( res[2], 3, rtol=1e-05, err_msg='result is {} but answer is {}'.format(res[2], 3), ) + self.assertEqual(res[2].shape, ()) # test TypeError and ValueError of api switch_case diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py index a83f5b8e5aa0b15731d338643db310537022305d..e4e12f4387d3264af38dd433e7f43c387a871fd8 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py @@ -22,7 +22,6 @@ import paddle.fluid as fluid class TensorFillDiagonal_Test(unittest.TestCase): def test_dim2_normal(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1]]).astype( 'float32' ) @@ -44,6 +43,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((3, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=0, wrap=True) loss = y.sum() loss.backward() @@ -55,10 +55,8 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_offset(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array([[2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype( 'float32' ) @@ -80,6 +78,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((3, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=2, wrap=True) loss = y.sum() loss.backward() @@ -91,7 +90,6 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_bool(self): expected_np = np.array( @@ -116,7 +114,6 @@ class TensorFillDiagonal_Test(unittest.TestCase): 
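# In the fill_diagonal_ tests, y = x * 2 is a non-leaf tensor, so each case
# now calls y.retain_grads() before backward. A compact sketch of the pattern
# being exercised, mirroring test_dim2_normal (illustrative only):
import paddle

x = paddle.ones((3, 3))
x.stop_gradient = False
y = x * 2
y.retain_grads()                             # y.grad is inspected afterwards
y.fill_diagonal_(1.0, offset=0, wrap=True)   # in-place diagonal overwrite
y.sum().backward()
# the diagonal was overwritten with a constant, so no gradient flows through
# it: the retained y.grad is zero on the diagonal and one elsewhere, which is
# what the expected_grad arrays in these tests encode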
self.assertEqual((x.numpy() == expected_np).all(), True) def test_dim2_unnormal_wrap(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array( [ [1, 2, 2], @@ -154,6 +151,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((7, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=0, wrap=True) loss = y.sum() loss.backward() @@ -165,10 +163,8 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_dim2_unnormal_unwrap(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array( [ [1, 2, 2], @@ -206,6 +202,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((7, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=0, wrap=False) loss = y.sum() loss.backward() @@ -217,10 +214,8 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_dim_larger2_normal(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) expected_np = np.array( [ [[1, 2, 2], [2, 2, 2], [2, 2, 2]], @@ -250,6 +245,7 @@ class TensorFillDiagonal_Test(unittest.TestCase): x = paddle.ones((3, 3, 3), dtype=dtype) x.stop_gradient = False y = x * 2 + y.retain_grads() y.fill_diagonal_(1, offset=0, wrap=True) loss = y.sum() loss.backward() @@ -261,7 +257,6 @@ class TensorFillDiagonal_Test(unittest.TestCase): (y.grad.numpy().astype('float32') == expected_grad).all(), True, ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py index 5e2756a8d24248ae6b78b0e52934ed6e718d0c43..419c142b6df5ff54909d62e649471741b48666bd 100644 --- a/python/paddle/fluid/tests/unittests/test_tile_op.py +++ b/python/paddle/fluid/tests/unittests/test_tile_op.py @@ -286,7 +286,6 @@ class TestTileDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.tile_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -318,7 +317,6 @@ class TestTileTripleGradCheck(unittest.TestCase): gradient_checker.triple_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.tile_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -335,24 +333,26 @@ class TestTileTripleGradCheck(unittest.TestCase): class TestTileAPI_ZeroDim(unittest.TestCase): def test_dygraph(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) x = paddle.rand([]) x.stop_gradient = False out = paddle.tile(x, []) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) self.assertEqual(x.grad.shape, []) self.assertEqual(out.grad.shape, []) out = paddle.tile(x, [3]) + out.retain_grads() out.backward() self.assertEqual(out.shape, [3]) self.assertEqual(x.grad.shape, []) self.assertEqual(out.grad.shape, [3]) out = paddle.tile(x, [2, 3]) + out.retain_grads() out.backward() self.assertEqual(out.shape, [2, 3]) self.assertEqual(x.grad.shape, 
[]) diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index c99e7e0712929eb7d14624406423ac39df62cbf9..e18c0bec99be0e08ffca8457f85611df12b2c00a 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -21,8 +21,6 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - unary_api_list = [ paddle.nn.functional.elu, paddle.nn.functional.gelu, @@ -102,7 +100,9 @@ class TestUnaryAPI(unittest.TestCase): for api in unary_api_list: x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = api(x) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -202,7 +202,9 @@ class TestReduceAPI(unittest.TestCase): else: x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = api(x, None) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -291,12 +293,16 @@ class TestBinaryAPI(unittest.TestCase): y = paddle.rand([]) x.stop_gradient = False y.stop_gradient = False + x.retain_grads() + y.retain_grads() if isinstance(api, dict): out = api['func'](x, y) out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -312,12 +318,16 @@ class TestBinaryAPI(unittest.TestCase): y = paddle.rand([]) x.stop_gradient = False y.stop_gradient = False + x.retain_grads() + y.retain_grads() if isinstance(api, dict): out = api['func'](x, y) out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + + out.retain_grads() out.backward() self.assertEqual(x.shape, [2, 3, 4]) @@ -331,6 +341,8 @@ class TestBinaryAPI(unittest.TestCase): # 3) x is 0D , y is ND x = paddle.rand([]) y = paddle.rand([2, 3, 4]) + x.retain_grads() + y.retain_grads() x.stop_gradient = False y.stop_gradient = False if isinstance(api, dict): @@ -339,6 +351,8 @@ class TestBinaryAPI(unittest.TestCase): np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -352,9 +366,11 @@ class TestBinaryAPI(unittest.TestCase): # 4) x is 0D , y is scalar x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() y = 0.5 if isinstance(api, dict): out = getattr(paddle.Tensor, api['cls_method'])(x, y) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -528,7 +544,9 @@ class TestSundryAPI(unittest.TestCase): def test_flip(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.flip(x, axis=[]) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) @@ -618,7 +636,9 @@ class TestSundryAPI(unittest.TestCase): def test_pow_factor(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.pow(x, 2.0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -628,7 +648,9 @@ class TestSundryAPI(unittest.TestCase): def test_cast(self): x = paddle.full([], 1.0, 'float32') x.stop_gradient = False + x.retain_grads() out = paddle.cast(x, 'int32') + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -638,7 +660,9 @@ class TestSundryAPI(unittest.TestCase): def test_cumprod(self): x = paddle.full([], 1.0, 'float32') 
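# test_zero_dim_tensor.py previously flipped the retain-grad flag once at
# import time; each case now opts in per tensor. The 0-D shape contract the
# suite checks, in a standalone sketch (illustrative only):
import paddle

x = paddle.rand([])          # 0-D tensor: shape is []
x.stop_gradient = False
x.retain_grads()
out = paddle.reshape(x, [1]) # reshaping 0-D to [1] adds one axis
out.retain_grads()
out.backward()
# gradients keep each tensor's own shape: x.grad is 0-D, out.grad is [1]
assert x.grad.shape == [] and out.grad.shape == [1]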
x.stop_gradient = False + x.retain_grads() out = paddle.cumprod(x, 0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -651,7 +675,9 @@ class TestSundryAPI(unittest.TestCase): def test_clip(self): x = paddle.uniform([], None, -10, 10) x.stop_gradient = False + x.retain_grads() out = paddle.clip(x, -5, 5) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -661,7 +687,9 @@ class TestSundryAPI(unittest.TestCase): def test_increment(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.increment(x, 1.0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -694,8 +722,10 @@ class TestSundryAPI(unittest.TestCase): def test_gather_1D(self): x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) + x.retain_grads() index = paddle.full([], 2, 'int64') out = paddle.gather(x, index) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -707,8 +737,10 @@ class TestSundryAPI(unittest.TestCase): x = paddle.to_tensor( [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False ) + x.retain_grads() index = paddle.full([], 1, 'int64') out = paddle.gather(x, index) + out.retain_grads() out.backward() self.assertEqual(out.shape, [3]) @@ -720,8 +752,10 @@ class TestSundryAPI(unittest.TestCase): x = paddle.to_tensor( [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False ) + x.retain_grads() index = paddle.full([], 1, 'int64') out = paddle.gather(x, index, axis=1) + out.retain_grads() out.backward() self.assertEqual(out.shape, [2]) @@ -731,9 +765,11 @@ class TestSundryAPI(unittest.TestCase): def test_scatter_1D(self): x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) + x.retain_grads() index = paddle.full([], 2, 'int64') updates = paddle.full([], 4.0) out = paddle.scatter(x, index, updates) + out.retain_grads() out.backward() self.assertEqual(out.shape, [5]) @@ -747,6 +783,7 @@ class TestSundryAPI(unittest.TestCase): index = paddle.full([], 1, 'int64') updates = paddle.to_tensor([1.0, 2.0, 3.0]) out = paddle.scatter(x, index, updates) + out.retain_grads() out.backward() self.assertEqual(out.shape, [2, 3]) @@ -762,10 +799,18 @@ class TestSundryAPI(unittest.TestCase): x2.stop_gradient = False x3.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + x3.retain_grads() + out1 = paddle.diagflat(x1, 1) out2 = paddle.diagflat(x2, -1) out3 = paddle.diagflat(x3, 0) + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + out1.backward() out2.backward() out3.backward() @@ -800,8 +845,11 @@ class TestSundryAPI(unittest.TestCase): def test_scatter_nd(self): index = paddle.to_tensor([3], dtype="int64") updates = paddle.full([], 2, dtype='float32') + updates.retain_grads() updates.stop_gradient = False + out = paddle.scatter_nd(index, updates, [5]) + out.retain_grads() out.backward() self.assertEqual(out.shape, [5]) @@ -818,6 +866,7 @@ class TestSundryAPI(unittest.TestCase): x = paddle.randn(()) x.stop_gradient = False + x.retain_grads() out = paddle.kthvalue(x, 1) out[0].backward() @@ -838,6 +887,7 @@ class TestSundryAPI(unittest.TestCase): paddle.set_device(place) x = paddle.randn(()) + x.retain_grads() x.stop_gradient = False out = paddle.mode(x) @@ -854,21 +904,30 @@ class TestSundryAPI(unittest.TestCase): def test_flatten(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() start_axis = 0 stop_axis = -1 out = paddle.flatten(x, start_axis=start_axis, stop_axis=stop_axis) + out.retain_grads() out.backward() self.assertEqual(out.shape, [1]) 
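# gather with a 0-D index drops the indexed axis entirely, which is what the
# updated expectations encode. A sketch of the 1-D case (illustrative):
import paddle

x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False)
x.retain_grads()
index = paddle.full([], 2, 'int64')   # 0-D index
out = paddle.gather(x, index)         # result is 0-D, not shape [1]
out.retain_grads()
out.backward()
assert out.shape == [] and x.grad.shape == [5]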
self.assertEqual(out.grad.shape, [1]) self.assertEqual(x.grad.shape, []) + def test_histogram(self): + x = paddle.rand([]) + out = paddle.histogram(x, bins=5, min=1, max=5) + self.assertEqual(out.shape, [5]) + def test_scale(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.scale(x, scale=2.0, bias=1.0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -900,6 +959,34 @@ class TestSundryAPI(unittest.TestCase): np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + def test_cumsum(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + + out1 = paddle.cumsum(x1) + out2 = paddle.cumsum(x1, axis=0) + out3 = paddle.cumsum(x1, axis=-1) + + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(x1.grad.shape, []) + self.assertTrue(x1.grad.numpy() == 3) + self.assertEqual(out1.shape, [1]) + self.assertEqual(out1.grad.shape, [1]) + self.assertTrue(out1.grad.numpy() == 1) + self.assertEqual(out2.shape, []) + self.assertEqual(out2.grad.shape, []) + self.assertTrue(out2.grad.numpy() == 1) + self.assertEqual(out3.shape, []) + self.assertEqual(out3.grad.shape, []) + self.assertTrue(out3.grad.numpy() == 1) + def test_add_n(self): x1 = paddle.rand([]) x1.stop_gradient = False @@ -911,6 +998,9 @@ class TestSundryAPI(unittest.TestCase): out1 = paddle.add_n(x1) out2 = paddle.add_n([x2, x3]) + out1.retain_grads() + out2.retain_grads() + out1.backward() out2.backward() @@ -928,26 +1018,31 @@ class TestSundryAPI(unittest.TestCase): def test_reshape_list(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.reshape(x, []) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, []) self.assertEqual(out.grad.shape, []) out = paddle.reshape(x, [1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1]) self.assertEqual(out.grad.shape, [1]) out = paddle.reshape(x, [-1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1]) self.assertEqual(out.grad.shape, [1]) out = paddle.reshape(x, [-1, 1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1, 1]) @@ -955,9 +1050,11 @@ class TestSundryAPI(unittest.TestCase): def test_reshape_tensor(self): x = paddle.rand([1, 1]) + x.retain_grads() x.stop_gradient = False out = paddle.reshape(x, []) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, []) @@ -965,6 +1062,7 @@ class TestSundryAPI(unittest.TestCase): new_shape = paddle.to_tensor([1, 1, 1], "int32") out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1, 1, 1]) @@ -972,6 +1070,7 @@ class TestSundryAPI(unittest.TestCase): new_shape = paddle.to_tensor([-1], "int32") out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1]) @@ -979,6 +1078,7 @@ class TestSundryAPI(unittest.TestCase): new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1, 1]) @@ -1019,6 +1119,7 @@ class TestSundryAPI(unittest.TestCase): x = paddle.rand([]) 
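# test_cumsum pins down an asymmetry worth noting: with no axis the input is
# flattened first, so a 0-D input yields a 1-D result of length one, while an
# explicit axis preserves the 0-D shape. A sketch (illustrative):
import paddle

x = paddle.rand([])
x.stop_gradient = False
out1 = paddle.cumsum(x)           # flattened: shape becomes [1]
out2 = paddle.cumsum(x, axis=0)   # explicit axis: shape stays []
out1.retain_grads()
out2.retain_grads()
out1.backward()
out2.backward()
assert out1.shape == [1] and out2.shape == []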
x.stop_gradient = False out = paddle.reverse(x, axis=[]) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) @@ -1029,9 +1130,14 @@ class TestSundryAPI(unittest.TestCase): x2 = paddle.rand([]) x1.stop_gradient = False x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() out1 = paddle.sort(x1, axis=-1) out2 = paddle.sort(x2, axis=0) + out1.retain_grads() + out2.retain_grads() + out1.backward() out2.backward() @@ -1051,9 +1157,15 @@ class TestSundryAPI(unittest.TestCase): x2 = paddle.rand([]) x1.stop_gradient = False x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + out1 = paddle.argsort(x1, axis=-1) out2 = paddle.argsort(x2, axis=0) + out1.retain_grads() + out2.retain_grads() + out1.backward() out2.backward() @@ -1075,6 +1187,7 @@ class TestSundryAPI(unittest.TestCase): w0 = paddle.rand([]) x0.stop_gradient = False y0.stop_gradient = False + y0.retain_grads() out0 = paddle.lerp(x0, y0, w0) out0.backward() @@ -1089,6 +1202,8 @@ class TestSundryAPI(unittest.TestCase): w1 = paddle.rand([]) x1.stop_gradient = False y1.stop_gradient = False + x1.retain_grads() + y1.retain_grads() out1 = paddle.lerp(x1, y1, w1) out1.backward() @@ -1103,6 +1218,8 @@ class TestSundryAPI(unittest.TestCase): w2 = paddle.rand([]) x2.stop_gradient = False y2.stop_gradient = False + x2.retain_grads() + y2.retain_grads() out2 = paddle.lerp(x2, y2, w2) out2.backward() @@ -1120,6 +1237,7 @@ class TestSundryAPI(unittest.TestCase): x = paddle.randn(()) x.stop_gradient = False + x.retain_grads() out = paddle.repeat_interleave(x, 2, None) out.backward() @@ -1145,6 +1263,7 @@ class TestSundryAPI(unittest.TestCase): dtype='float32', stop_gradient=False, ) + logit.retain_grads() label = paddle.to_tensor( [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32' ) @@ -1153,6 +1272,7 @@ class TestSundryAPI(unittest.TestCase): out0 = F.sigmoid_focal_loss(logit, label, normalizer=fg_num_0) out1 = F.sigmoid_focal_loss(logit, label, normalizer=fg_num_1) + out0.retain_grads() np.testing.assert_array_equal( out0.numpy(), @@ -1168,12 +1288,22 @@ class TestSundryAPI(unittest.TestCase): y = paddle.full([], 0.6) self.assertFalse(paddle.allclose(x, y)) + def test_equalall(self): + x = paddle.full([], 0.5) + y = paddle.full([], 0.6) + out = paddle.equal_all(x, y) + self.assertEqual(out.shape, []) + self.assertFalse(out) + def test_where(self): x1 = paddle.full([], 1) x2 = paddle.full([], 2) x1.stop_gradient = False x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() out = paddle.where(x1 > x2, x1, x2) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) self.assertEqual(out.numpy(), 2) @@ -1186,9 +1316,12 @@ class TestSundryAPI(unittest.TestCase): def test_atan2(self): x1 = paddle.full([], 0) x2 = paddle.full([], 2) + x1.retain_grads() + x2.retain_grads() x1.stop_gradient = False x2.stop_gradient = False out = paddle.atan2(x1, x2) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) self.assertEqual(out.numpy(), 0) @@ -1198,6 +1331,31 @@ class TestSundryAPI(unittest.TestCase): self.assertEqual(x1.grad.numpy(), 0.5) self.assertEqual(x2.grad.numpy(), 0) + def test_maseked_select(self): + x = paddle.rand([]) + x.stop_gradient = False + mask = paddle.full([], True, dtype='bool') + y = paddle.masked_select(x, mask) + + y.retain_grads() + y.backward() + self.assertEqual(y.shape, [1]) + self.assertEqual(y.numpy(), x.numpy()) + self.assertEqual(y.grad.shape, [1]) + self.assertEqual(x.grad.shape, []) + self.assertEqual(x.grad.numpy(), 1) + + 
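# masked_select always returns a 1-D tensor, even when both input and mask
# are 0-D, which is why test_maseked_select expects y.shape == [1] while
# x.grad stays 0-D. A sketch (illustrative):
import paddle

x = paddle.rand([])
x.stop_gradient = False
mask = paddle.full([], True, dtype='bool')   # 0-D boolean mask
y = paddle.masked_select(x, mask)            # selection flattens: shape [1]
y.retain_grads()
y.backward()
assert y.shape == [1] and x.grad.shape == []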
def test_t(self): + x = paddle.full([], 2.0) + x.stop_gradient = False + x.retain_grads() + out = paddle.t(x) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + class TestSundryAPIStatic(unittest.TestCase): def setUp(self): @@ -1516,6 +1674,16 @@ class TestSundryAPIStatic(unittest.TestCase): self.assertEqual(res[1].shape, ()) self.assertEqual(res[2].shape, (1,)) + @prog_scope() + def test_histogram(self): + x = paddle.full([], 1, 'float32') + out = paddle.histogram(x, bins=5, min=1, max=5) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={}, fetch_list=[out]) + + self.assertEqual(res[0].shape, (5,)) + @prog_scope() def test_scale(self): x = paddle.rand([]) @@ -1560,6 +1728,45 @@ class TestSundryAPIStatic(unittest.TestCase): np.testing.assert_array_equal(out3_2, np.asarray(1)) @prog_scope() + def test_cumsum(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + + out1 = paddle.cumsum(x1) + out2 = paddle.cumsum(x1, axis=0) + out3 = paddle.cumsum(x1, axis=-1) + + paddle.static.append_backward(out1.sum()) + paddle.static.append_backward(out2.sum()) + paddle.static.append_backward(out3.sum()) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + out3, + x1.grad_name, + out1.grad_name, + out2.grad_name, + out3.grad_name, + ], + ) + self.assertEqual(res[0].shape, (1,)) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1) + self.assertEqual(res[4].shape, (1,)) + self.assertEqual(res[4], 1) + self.assertEqual(res[5].shape, ()) + self.assertEqual(res[5], 1) + self.assertEqual(res[6].shape, ()) + self.assertEqual(res[6], 1) + self.assertEqual(out2.shape, ()) + self.assertEqual(out3.shape, ()) + def test_add_n(self): x1 = paddle.rand([]) x1.stop_gradient = False @@ -1868,6 +2075,37 @@ class TestSundryAPIStatic(unittest.TestCase): self.assertEqual(res[0].shape, ()) + @prog_scope() + def test_maseked_select(self): + x = paddle.rand([]) + x.stop_gradient = False + mask = paddle.full([], True, dtype='bool') + y = paddle.masked_select(x, mask) + paddle.static.append_backward(y.sum()) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, y, y.grad_name, x.grad_name]) + self.assertEqual(res[1].shape, (1,)) + self.assertEqual(res[1], res[0]) + self.assertEqual(res[2].shape, (1,)) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1) + + @prog_scope() + def test_t(self): + x = paddle.full([], 2.0) + x.stop_gradient = False + out = paddle.t(x) + paddle.static.append_backward(out.sum()) + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, feed={}, fetch_list=[out, out.grad_name, x.grad_name] + ) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
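# The static-graph counterparts verify the same contract through the
# executor: fetched 0-D outputs come back as NumPy arrays of shape ().
# A condensed sketch of the new static test_t case, assuming a default
# executor as in the suite's setUp (illustrative only):
import paddle

paddle.enable_static()
x = paddle.full([], 2.0)
x.stop_gradient = False
out = paddle.t(x)                        # transpose of 0-D is still 0-D
paddle.static.append_backward(out.sum())
exe = paddle.static.Executor()
res = exe.run(paddle.static.default_main_program(),
              fetch_list=[out, x.grad_name])
assert res[0].shape == () and res[1].shape == ()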
class TestNoBackwardAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index afaf3b2a52fab7551d7b9b0338b4f4cdad66cb4f..95934caf52b34040619baabc287abd201b2a6ccd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -228,7 +228,8 @@ def get_xpu_op_support_types(op_name, dev_id=0): op_name_type = op_name + "_" + stype if op_name_type in ops: support_types.append(stype) - + if len(support_types) == 0: + print("WARNING: support_types is EMPTY for op", op_name) return support_types diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py index 6f112a76204c93506340f511ccddf061b5e5fabe..f7833199180ecc356adcd238b7d600785a0edf20 100644 --- a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py +++ b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py @@ -53,28 +53,26 @@ class TestProcessGroupFp32(unittest.TestCase): ) sys.stdout.write("rank {}: test new group api ok\n".format(pg.rank())) + # TODO(zhangxiaoci) allreduce unittest raise error # test allreduce sum # rank 0 - x = np.random.random(self.shape).astype(self.dtype) - tensor_x = paddle.to_tensor(x) + # x = np.random.random(self.shape).astype(self.dtype) + # tensor_x = paddle.to_tensor(x) # rank 1 - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) + # y = np.random.random(self.shape).astype(self.dtype) + # tensor_y = paddle.to_tensor(y) - sum_result = tensor_x + tensor_y - if pg.rank() == 0: - task = dist.all_reduce(tensor_x) - assert np.array_equal(tensor_x, sum_result) - else: - task = dist.all_reduce(tensor_y) - assert np.array_equal(tensor_y, sum_result) - - sys.stdout.write( - "rank {}: test allreduce sum api ok\n".format(pg.rank()) - ) + # sum_result = tensor_x + tensor_y + # if pg.rank() == 0: + # task = dist.all_reduce(tensor_x) + # assert np.array_equal(tensor_x, sum_result) + # else: + # task = dist.all_reduce(tensor_y) + # assert np.array_equal(tensor_y, sum_result) - # TODO - # test allreduce max/min/prod + # sys.stdout.write( + # "rank {}: test allreduce sum api ok\n".format(pg.rank()) + # ) # test broadcast # rank 0 @@ -178,6 +176,52 @@ class TestProcessGroupFp32(unittest.TestCase): assert np.array_equal(tensor_y, old_tensor_y) sys.stdout.write("rank {}: test reduce sum api ok\n".format(pg.rank())) + # test send async api + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + if pg.rank() == 0: + task = dist.send(tensor_x, 1, sync_op=False) + task.wait() + else: + task = dist.recv(tensor_y, 0, sync_op=False) + task.wait() + assert np.array_equal(tensor_y, tensor_x) + + # test send sync api + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + if pg.rank() == 0: + task = dist.send(tensor_x, 1, sync_op=True) + else: + task = dist.recv(tensor_y, 0, sync_op=True) + assert np.array_equal(tensor_y, tensor_x) + + # test send 0-d tensor + # rank 0 + x = np.random.uniform(-1, 1, []).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.array(0.2022).astype(self.dtype) + tensor_y = 
paddle.to_tensor(y) + + if pg.rank() == 0: + task = dist.send(tensor_x, 1, sync_op=True) + else: + task = dist.recv(tensor_y, 0, sync_op=True) + assert np.array_equal(tensor_y, tensor_x) and tensor_y.shape == [] + + sys.stdout.write("rank {}: test send api ok\n".format(pg.rank())) + class TestProcessGroupFp16(TestProcessGroupFp32): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py index 0491e7ef5f1d5bc2798c0fce1bc29f7e4fef266e..62120c0d1be8d6155ec1c282f90fb1331b8c70ce 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py @@ -46,6 +46,11 @@ class XPUTestLogicalAnd(XPUOpTestWrapper): def set_case(self): self.op_type = 'logical_and' + # special range for bool dtype + if self.dtype == np.dtype(np.bool): + self.low = 0 + self.high = 2 + x = np.random.randint( self.low, self.high, self.x_shape, dtype=self.dtype ) @@ -62,7 +67,7 @@ class XPUTestLogicalAnd(XPUOpTestWrapper): self.outputs = {'Out': out} def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [2, 3, 4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -76,7 +81,7 @@ class XPUTestLogicalAnd(XPUOpTestWrapper): class XPUTestLogicalAndCase1(XPUTestLogicalAndBase): def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -102,6 +107,11 @@ class XPUTestLogicalOr(XPUOpTestWrapper): def set_case(self): self.op_type = 'logical_or' + # special range for bool dtype + if self.dtype == np.dtype(np.bool): + self.low = 0 + self.high = 2 + x = np.random.randint( self.low, self.high, self.x_shape, dtype=self.dtype ) @@ -118,7 +128,7 @@ class XPUTestLogicalOr(XPUOpTestWrapper): self.outputs = {'Out': out} def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [2, 3, 4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -132,7 +142,7 @@ class XPUTestLogicalOr(XPUOpTestWrapper): class XPUTestLogicalOrCase1(XPUTestLogicalOrBase): def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -158,6 +168,11 @@ class XPUTestLogicalXor(XPUOpTestWrapper): def set_case(self): self.op_type = 'logical_xor' + # special range for bool dtype + if self.dtype == np.dtype(np.bool): + self.low = 0 + self.high = 2 + x = np.random.randint( self.low, self.high, self.x_shape, dtype=self.dtype ) @@ -174,7 +189,7 @@ class XPUTestLogicalXor(XPUOpTestWrapper): self.outputs = {'Out': out} def init_case(self): - self.dtype = np.int64 + self.dtype = self.in_type self.x_shape = [2, 3, 4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -188,7 +203,7 @@ class XPUTestLogicalXor(XPUOpTestWrapper): class XPUTestLogicalXorCase1(XPUTestLogicalXorBase): def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [4, 5] self.y_shape = [2, 3, 4, 5] self.low = -100 @@ -214,6 +229,11 @@ class XPUTestLogicalNot(XPUOpTestWrapper): def set_case(self): self.op_type = 'logical_not' + # special range for bool dtype + if self.dtype == np.dtype(np.bool): + self.low = 0 + self.high = 2 + x = np.random.randint( self.low, self.high, self.x_shape, dtype=self.dtype ) @@ -224,7 +244,7 @@ class XPUTestLogicalNot(XPUOpTestWrapper): self.outputs = {'Out': out} def init_case(self): - self.dtype = np.int32 + self.dtype = self.in_type self.x_shape = [2, 3, 4, 5] 
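# The special-cased range for bool exists because np.random.randint bounds
# must fit the requested dtype: only {0, 1} are representable as bool, so
# low/high become 0 and 2 (high is exclusive). Note also that np.bool, used
# in these checks, has been a deprecated alias of the builtin bool since
# NumPy 1.20 and was removed in 1.24; np.bool_ is the stable spelling.
import numpy as np

x = np.random.randint(0, 2, size=(2, 3), dtype=np.bool_)   # valid bool draw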
self.low = -100 self.high = 100 diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py old mode 100644 new mode 100755 index d8ca8978843cea2e74e410548f478e5fbc7239dc..a9d95fc963ce338dd06787d13ea26514dc9b4855 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -477,7 +477,7 @@ class TestSundryAPI(unittest.TestCase): self.assertEqual(x.grad.shape, [2, 3]) self.assertEqual(out.grad.shape, [2]) - def test_scatter_1D(self): + def _test_scatter_1D(self): x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) index = paddle.full([], 2, 'int64') updates = paddle.full([], 4.0) @@ -488,7 +488,7 @@ class TestSundryAPI(unittest.TestCase): self.assertEqual(out.numpy()[2], 4) self.assertEqual(out.grad.shape, [5]) - def test_scatter_XD(self): + def _test_scatter_XD(self): x = paddle.to_tensor( [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False ) @@ -592,6 +592,29 @@ class TestSundryAPI(unittest.TestCase): np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + def test_cumsum(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + + out1 = paddle.cumsum(x1) + out2 = paddle.cumsum(x1, axis=0) + out3 = paddle.cumsum(x1, axis=-1) + + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(out1.shape, [1]) + self.assertEqual(out1.grad.shape, [1]) + self.assertEqual(out2.shape, []) + self.assertEqual(out2.grad.shape, []) + self.assertEqual(out3.shape, []) + self.assertEqual(out3.grad.shape, []) + def test_add_n(self): x1 = paddle.rand([]) x1.stop_gradient = False @@ -780,6 +803,27 @@ class TestSundryAPI(unittest.TestCase): y = paddle.full([], 0.6) self.assertFalse(paddle.allclose(x, y)) + def test_equalall(self): + x = paddle.full([], 0.5) + y = paddle.full([], 0.6) + out = paddle.equal_all(x, y) + self.assertEqual(out.shape, []) + self.assertFalse(out) + + def test_maseked_select(self): + x = paddle.rand([]) + x.stop_gradient = False + mask = paddle.full([], True, dtype='bool') + y = paddle.masked_select(x, mask) + + y.retain_grads() + y.backward() + self.assertEqual(y.shape, [1]) + self.assertEqual(y.numpy(), x.numpy()) + self.assertEqual(y.grad.shape, [1]) + self.assertEqual(x.grad.shape, []) + self.assertEqual(x.grad.numpy(), 1) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
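# equal_all reduces its comparison to a single scalar result, and the XPU
# zero-dim suite now pins that down as a 0-D bool tensor. A sketch
# (illustrative):
import paddle

x = paddle.full([], 0.5)
y = paddle.full([], 0.6)
out = paddle.equal_all(x, y)   # one 0-D bool, regardless of input shape
assert out.shape == [] and not bool(out)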
diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py index 456ac20db2e5e1b24039403c3d515474e1ccb730..23bf8f0f7e3bff0e51eea216aac5d322099b1121 100644 --- a/python/paddle/incubate/autograd/composite_rules.py +++ b/python/paddle/incubate/autograd/composite_rules.py @@ -29,7 +29,9 @@ def _composite(op, *args): @REGISTER_COMPOSITE('softmax') def softmax_composite(x, axis): """define composite rule of op softmax""" - molecular = exp(x) - denominator = broadcast_to(sum(molecular, axis=axis, keepdim=True), x.shape) + max_temp = max(x, axis, keepdim=True) + max_temp.stop_gradient = True + molecular = exp(x - max_temp) + denominator = sum(molecular, axis=axis, keepdim=True) res = divide(molecular, denominator) return res diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 7cfabdd9e5551627ba6ce55e81486702fffb032d..76e0802194272927c6318bba7def02e67314cdfd 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -16,7 +16,7 @@ import logging import typing import paddle -from paddle.fluid import backward, framework +from paddle.fluid import backward, core, framework from paddle.incubate.autograd import primx, utils @@ -218,13 +218,22 @@ def grad(outputs, inputs, grad_outputs=None): @framework.static_only def to_prim(blocks): """Search nonbasic ops which have be registered composite rules and replace them with primitive ops.""" + if not core.enable_prim_forward(): + return if isinstance(blocks, paddle.fluid.framework.Block): logging.info("Atomize composite op to primitive ops begin.") - primx._lower_composite(blocks) - return + main_program = blocks.program elif isinstance(blocks, typing.Sequence): for item in blocks: - to_prim(item) - return + if not isinstance(item, paddle.fluid.framework.Block): + raise TypeError( + f"Expect block or sequence of blocks, but sequence contains {type(item)}." + ) + main_program = blocks[0].program else: - raise TypeError + raise TypeError( + f"Expect block or sequence of blocks, but got {type(blocks)}." + ) + with framework.program_guard(main_program): + primx._lower_composite(blocks) + return diff --git a/python/paddle/incubate/autograd/primitives.py b/python/paddle/incubate/autograd/primitives.py index 371746bf349c926fabf831a7dcf583f911de2eb5..a9ec324c05a7a1fdb36b4a7849689d4900208c52 100644 --- a/python/paddle/incubate/autograd/primitives.py +++ b/python/paddle/incubate/autograd/primitives.py @@ -39,6 +39,8 @@ from paddle.tensor import log1p # noqa: F401 from paddle.tensor import logcumsumexp # noqa: F401 from paddle.tensor import logit # noqa: F401 from paddle.tensor import logsumexp # noqa: F401 +from paddle.tensor import max # noqa: F401 +from paddle.tensor import min # noqa: F401 from paddle.tensor import multiply # noqa: F401 from paddle.tensor import pow # noqa: F401 from paddle.tensor import prod # noqa: F401 @@ -73,6 +75,8 @@ math_op = [ 'logsumexp', 'logcumsumexp', 'logit', + 'max', + 'min', ] trigonometric_op = [ diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 6f2d4d9d5213c768aad2e354c247cf5968ef6c2d..c472137ab71691e87bb1e138cf39a143055b89dc 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
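# The rewritten softmax composite rule subtracts the per-axis max (with
# stop_gradient set, so autograd treats it as a constant) before
# exponentiating. Softmax is shift-invariant, and after the shift the largest
# exponent is exp(0) == 1, so the computation cannot overflow. The same
# computation in a standalone NumPy sketch, independent of the rule above:
import numpy as np

def stable_softmax(x, axis=-1):
    m = np.max(x, axis=axis, keepdims=True)  # the shift; constant for autograd
    e = np.exp(x - m)                        # bounded above by 1
    return e / np.sum(e, axis=axis, keepdims=True)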
+import logging +import typing from collections import OrderedDict import paddle @@ -575,90 +577,101 @@ def _lower_composite(block, blacklist=[]): return_list.append(x) return return_list - # Step1: Do some preparatory work for lower - lower_fn = _composite - lookup_fn = lookup_composite - - value_table = {} - to_bind = {} - to_bind_rev = {} - for var in block.desc.all_vars(): - value_table[var.name()] = block.var(var.name()) - - ops_to_remove = [] - vars_to_remove = set() - - # Step2: Process all ops in the target block - for op_idx in range(len(block.ops)): - op = block.ops[op_idx] - ops_to_remove.append(op_idx) - if lookup_fn(op.type) is not None and op.type not in blacklist: - input_args = prepare_python_api_arguments(op) - bind(input_args, to_bind, value_table) - - for orig_out, new_out in zip( - expand_nested_list(get_output_var_list(op)), - expand_nested_list(as_tensors(lower_fn(op, *input_args))), - ): - assert not (orig_out is None) ^ ( - new_out is None - ), "orig_out and new_out should match." - vars_to_remove.add(new_out.name) - value_table[new_out.name] = new_out - to_bind[orig_out.name] = new_out.name - to_bind_rev[new_out.name] = orig_out.name - else: - inputs = {} - for i in range(len(op.input_names)): - inputs[op.input_names[i]] = bind_name( - op.input(op.input_names[i]), to_bind - ) - - outputs = {} - for i in range(len(op.output_names)): - outputs[op.output_names[i]] = op.output(op.output_names[i]) - - attrs = {} - for name in sorted(op.attr_names): - attrs[name] = op.attr(name) - from paddle.fluid.dygraph.base import param_guard - - new_op_desc = block.desc.append_op() - with param_guard(inputs), param_guard(outputs): - op = Operator( - block=block, - desc=new_op_desc, - type=op.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - ) - block.ops.append(op) - - # Step3: Do some post-processing work - for op_idx in reversed(ops_to_remove): - block.desc._remove_op(op_idx, op_idx + 1) - del block.ops[op_idx] - block._sync_with_cpp() - - for op_idx in range(len(block.ops)): - op = block.ops[op_idx] - for in_name in op.input_arg_names: - if in_name in to_bind_rev: - op._rename_input(in_name, to_bind_rev[in_name]) - - for out_name in op.output_arg_names: - if out_name in to_bind_rev: - op._rename_output(out_name, to_bind_rev[out_name]) + if isinstance(block, paddle.fluid.framework.Block): + logging.info("Atomize composite op to primitive ops begin.") + + # Step1: Do some preparatory work for lower + lower_fn = _composite + lookup_fn = lookup_composite + + value_table = {} + to_bind = {} + to_bind_rev = {} + for var in block.desc.all_vars(): + value_table[var.name()] = block.var(var.name()) + + ops_to_remove = [] + vars_to_remove = set() + + # Step2: Process all ops in the target block + for op_idx in range(len(block.ops)): + op = block.ops[op_idx] + ops_to_remove.append(op_idx) + if lookup_fn(op.type) is not None and op.type not in blacklist: + input_args = prepare_python_api_arguments(op) + bind(input_args, to_bind, value_table) + + for orig_out, new_out in zip( + expand_nested_list(get_output_var_list(op)), + expand_nested_list(as_tensors(lower_fn(op, *input_args))), + ): + assert not (orig_out is None) ^ ( + new_out is None + ), "orig_out and new_out should match." 
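# to_prim and _lower_composite now share one dispatch contract: a Block is
# lowered in place, a sequence of blocks is walked element by element, and
# anything else raises TypeError naming the offending type. The idiom,
# distilled (lower_one is a hypothetical stand-in for the lowering body):
import typing

from paddle.fluid.framework import Block

def _dispatch(blocks, lower_one):
    if isinstance(blocks, Block):
        lower_one(blocks)
    elif isinstance(blocks, typing.Sequence):
        for item in blocks:
            _dispatch(item, lower_one)
    else:
        raise TypeError(
            f"Expect block or sequence of blocks, but got {type(blocks)}."
        )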
+ vars_to_remove.add(new_out.name) + value_table[new_out.name] = new_out + to_bind[orig_out.name] = new_out.name + to_bind_rev[new_out.name] = orig_out.name + else: + inputs = {} + for i in range(len(op.input_names)): + inputs[op.input_names[i]] = bind_name( + op.input(op.input_names[i]), to_bind + ) + + outputs = {} + for i in range(len(op.output_names)): + outputs[op.output_names[i]] = op.output(op.output_names[i]) + + attrs = {} + for name in sorted(op.attr_names): + attrs[name] = op.attr(name) + from paddle.fluid.dygraph.base import param_guard + + new_op_desc = block.desc.append_op() + with param_guard(inputs), param_guard(outputs): + op = Operator( + block=block, + desc=new_op_desc, + type=op.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + ) + block.ops.append(op) + + # Step3: Do some post-processing work + for op_idx in reversed(ops_to_remove): + block.desc._remove_op(op_idx, op_idx + 1) + del block.ops[op_idx] + block._sync_with_cpp() - for var_name in sorted(vars_to_remove): - assert ( - var_name in to_bind_rev - ), 'var_name "{}" is not in to_bind_rev.'.format(var_name) - if var_name != to_bind_rev[var_name]: - block.desc._remove_var(var_name.encode()) - del block.vars[var_name] - block._sync_with_cpp() + for op_idx in range(len(block.ops)): + op = block.ops[op_idx] + for in_name in op.input_arg_names: + if in_name in to_bind_rev: + op._rename_input(in_name, to_bind_rev[in_name]) + + for out_name in op.output_arg_names: + if out_name in to_bind_rev: + op._rename_output(out_name, to_bind_rev[out_name]) + + for var_name in sorted(vars_to_remove): + assert ( + var_name in to_bind_rev + ), 'var_name "{}" is not in to_bind_rev.'.format(var_name) + if var_name != to_bind_rev[var_name]: + block.desc._remove_var(var_name.encode()) + del block.vars[var_name] + block._sync_with_cpp() + return + + elif isinstance(block, typing.Sequence): + for item in block: + _lower_composite(item) + return + else: + raise TypeError @framework.static_only diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 040ef36854f3fb9db4d0684cd861fd3febba26f0..293c8b40f7752bffa3819e459a929052780cff0e 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -32,6 +32,7 @@ from paddle.fluid.layers.utils import _hash_with_id, flatten, pack_sequence_as from . import logging_utils from .return_transformer import RETURN_NO_VALUE_MAGIC_NUM +from .utils import _out_grad_names, _param_grad_names __all__ = [] @@ -206,10 +207,6 @@ class PartialProgramLayer: else: return core.Scope() - @LazyInitialized - def __fake_vars(self): - return _create_fake_var() - @LazyInitialized def _double_grads(self): return self._get_double_grads(self._origin_main_program) @@ -379,46 +376,15 @@ class PartialProgramLayer: @LazyInitialized def _param_grad_names(self): - names = [] - # NOTE: `names` and `self._params` must be in the same order so that - # the param grad name can be set correctly in the run_program. - for param in self._params: - candidate = [ - var_name - for var_name in self._train_program.block(0).vars.keys() - if var_name.endswith(param.name + '@GRAD') - ] - if candidate: - names.append( - max(candidate, key=lambda name: name.count('grad/')) - ) - else: - names.append(param.name + '@GRAD') - return names + return _param_grad_names(self._train_program.desc, self._params) @LazyInitialized def _out_grad_names(self): - """ - Parse Out@GARD name from original train and infer program. 
- """ - names = [] - origin_infer_program = self._create_program(is_infer_mode=True) - origin_train_program = self._train_program - fwd_end_op_index = len(origin_infer_program.block(0).ops) - for i in range( - fwd_end_op_index + 1, - min( - fwd_end_op_index + 2 * len(self._outputs.var_ids), - len(origin_train_program.block(0).ops), - ), - 2, - ): - op = origin_train_program.block(0).ops[i] - if op.type == 'fill_constant': - var_name = op.output('Out')[0] - names.append(var_name) - - return names + return _out_grad_names( + self._train_program.desc, + self._create_program(is_infer_mode=True).desc.block(0).op_size(), + len(self._outputs.var_ids), + ) @property def program(self): @@ -604,8 +570,14 @@ class PartialProgramLayer: if isinstance(out, framework.Variable): targets.append(program.global_block().var(out.name)) - if targets and self._params: - backward.gradients(targets=targets, inputs=[]) + if targets: + enable_prim = self._build_strategy.build_cinn_pass + if enable_prim and core.enable_prim_backward(): + core.set_prim_enabled(True) + backward.gradients(targets=targets, inputs=[]) + core.set_prim_enabled(False) + else: + backward.gradients(targets=targets, inputs=[]) start_idx = len(main_program.block(0).ops) + 2 * len( self._outputs.tolist() @@ -647,7 +619,7 @@ class PartialProgramLayer: if "@GRAD" in name: var_desc = block.vars[name].desc var_base = None - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: var_base = core.VarBase( var_desc.dtype(), var_desc.shape(), @@ -902,7 +874,7 @@ class PartialProgramLayer: for i, value in enumerate(flatten_inputs): if isinstance(value, np.ndarray): var = None - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: var = core.VarBase( value=value, name=self._inputs[i].desc.name(), @@ -946,7 +918,7 @@ class PartialProgramLayer: if var_desc.name() in out_varbase_map: return out_varbase_map[var_desc.name()] - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: var_base = core.VarBase( var_desc.dtype(), var_desc.shape(), @@ -977,7 +949,7 @@ class PartialProgramLayer: inner_scope = self._get_scope( program_id=program_id, use_scope_cache=use_scope_cache ) - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: tmp_scope_vec = core.VarBase( core.VarDesc.VarType.FP32, [], @@ -1123,19 +1095,14 @@ class PartialProgramLayer: ) def _valid_vars(self, vars): - """ - Note: run_program_op.InferShape requires `X`/'Out' not be null. - But it's common in dy2static, fake varBase is created to handle the - problem. 
- """ - return vars if vars else self.__fake_vars + return vars if vars else None def _create_fake_var(): """ Create a fake_var (force on CPU) to handle empty input or output """ - if not framework._in_eager_mode_: + if not framework.global_var._in_eager_mode_: return [ core.VarBase( core.VarDesc.VarType.FP32, diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 7fd6b0ce7fe004819e210bd01e158e56a27e574d..5b8493977e904b9fefdb4ae448b8df1abd499e13 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -18,7 +18,7 @@ import textwrap import threading import weakref -from paddle.fluid import _non_static_mode, framework +from paddle.fluid import _non_static_mode, core, framework from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import param_guard, switch_to_static_graph @@ -930,6 +930,13 @@ class ConcreteProgram: self.function = function self.kwargs = kwargs + @switch_to_static_graph + def _to_prim(self): + # TODO(Aurelius84): Fix this cycle import problem + from paddle.incubate.autograd.primapi import to_prim + + to_prim(self.main_program.blocks) + @staticmethod @switch_to_static_graph def from_func_spec( @@ -1083,6 +1090,11 @@ class ProgramCache: self._recent_cache_key = None def _build_once(self, cache_key): + # TODO(Aurelius84): Need a gloabl FLAGS to enable/disable to_prim + enable_prim = cache_key.kwargs['build_strategy'].build_cinn_pass + if enable_prim and core.enable_prim_backward(): + core.set_prim_enabled(True) + concrete_program = ConcreteProgram.from_func_spec( func_spec=cache_key.function_spec, input_spec=cache_key.input_args_with_spec, @@ -1090,6 +1102,10 @@ class ProgramCache: class_instance=cache_key.class_instance, **cache_key.kwargs ) + + if enable_prim or core.enable_prim_forward() == "debug": + concrete_program._to_prim() + core.set_prim_enabled(False) return concrete_program, partial_program_from(concrete_program) def __getitem__(self, item): diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index 4d74c629a392683bd63e71ef2a5806f186354daf..4397728576ba755a618f706954cda16b45d3f6aa 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -1483,3 +1483,41 @@ def create_name_str(name_ids): names_str = ["'%s'" % (name.replace("'", "\\'")) for name in name_ids] return "(%s, )" % ','.join(names_str) + + +def _param_grad_names(program_desc, params): + """ + Parse PARAM@GARD name from original train and infer program. + """ + names = [] + # NOTE: `names` and `self._params` must be in the same order so that + # the param grad name can be set correctly in the run_program. + for param in params: + candidate = [ + var.name() + for var in program_desc.block(0).all_vars() + if var.name().endswith(param.name + '@GRAD') + ] + if candidate: + names.append(max(candidate, key=lambda name: name.count('grad/'))) + else: + names.append(param.name + '@GRAD') + + return names + + +def _out_grad_names(program_desc, fwd_end_op_index, out_size): + """ + Parse Out@GARD name from original train and infer program. 
+ """ + names = [] + for i in range( + fwd_end_op_index + 1, + min(fwd_end_op_index + 2 * out_size, program_desc.block(0).op_size()), + 2, + ): + op = program_desc.block(0).op(i) + if op.type() == 'fill_constant': + var_name = op.output('Out')[0] + names.append(var_name) + return names diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py index 9cd30545af8450343981aeb020f451b6c2f235bb..c488c758f4a262fdb4537f8d9a2c26d6248e4bbe 100644 --- a/python/paddle/jit/translated_layer.py +++ b/python/paddle/jit/translated_layer.py @@ -33,6 +33,8 @@ from paddle.jit.dy2static.partial_program import ( add_build_strategy_for, ) +from .dy2static.utils import _out_grad_names, _param_grad_names + __all__ = [] INFER_MODEL_SUFFIX = ".pdmodel" @@ -887,28 +889,7 @@ def _construct_params_and_buffers( def _valid_vars(vars): - if vars: - return vars - if framework._in_eager_without_dygraph_check(): - return [ - core.eager.Tensor( - core.VarDesc.VarType.FP32, - [], - "Fake_var", - core.VarDesc.VarType.RAW, - False, - ) - ] - else: - return [ - core.VarBase( - core.VarDesc.VarType.FP32, - [], - "Fake_var", - core.VarDesc.VarType.RAW, - False, - ) - ] + return vars if vars else None def _run_dygraph(instance, input, program_holder): @@ -1041,6 +1022,15 @@ def _run_dygraph(instance, input, program_holder): 'program_id', _hash_with_id(trace_program, instance), ] + if not instance._is_test: + attrs.extend( + ( + 'param_grad_names', + _param_grad_names(trace_program, persistable_vars), + 'out_grad_names', + _out_grad_names(trace_program, end_op_index, len(output_vars)), + ) + ) use_interpretorcore = ( _is_enable_standalone_executor() diff --git a/python/paddle/static/amp/__init__.py b/python/paddle/static/amp/__init__.py index 01832fd536769bff0b5613d399d35d61131cad5c..795e49698f34462b4182a69d49e4275db94c70d7 100644 --- a/python/paddle/static/amp/__init__.py +++ b/python/paddle/static/amp/__init__.py @@ -19,9 +19,3 @@ from .fp16_lists import CustomOpLists, AutoMixedPrecisionLists from . import fp16_utils from .fp16_utils import fp16_guard, cast_model_to_fp16, cast_parameters_to_fp16 from . import bf16 -from .bf16 import bf16_guard - -__all__ = [] -__all__ += decorator.__all__ -__all__ += fp16_lists.__all__ -__all__ += fp16_utils.__all__ diff --git a/python/paddle/static/amp/amp_nn.py b/python/paddle/static/amp/amp_nn.py index c5e812c141d3696459c6b040c30fe7e39f02629c..0f936ae8f57b9e9731173aac4efa44593993765d 100644 --- a/python/paddle/static/amp/amp_nn.py +++ b/python/paddle/static/amp/amp_nn.py @@ -18,8 +18,6 @@ from paddle.fluid.data_feeder import check_type, check_variable_and_dtype from paddle.fluid.framework import Variable, in_dygraph_mode from paddle.fluid.layer_helper import LayerHelper -__all__ = ['check_finite_and_unscale', 'update_loss_scaling'] - def check_finite_and_unscale(x, scale, name=None, float_status=None): """ diff --git a/python/paddle/static/amp/bf16/__init__.py b/python/paddle/static/amp/bf16/__init__.py index 82b616b299447c2a297d2dd5e718e8ef4a09b085..fad4a654fd88b8743720a0e469d29c95ad4a1462 100644 --- a/python/paddle/static/amp/bf16/__init__.py +++ b/python/paddle/static/amp/bf16/__init__.py @@ -24,8 +24,3 @@ from .amp_utils import ( ) from . 
import decorator from .decorator import decorate_bf16 - -__all__ = [] -__all__ += decorator.__all__ -__all__ += amp_lists.__all__ -__all__ += amp_utils.__all__ diff --git a/python/paddle/static/amp/bf16/amp_lists.py b/python/paddle/static/amp/bf16/amp_lists.py index d1878a3367fbc7a72a9cc90cea7028a6893ce37d..5ea5beb708b89414f6aa468f18be8dc28e073277 100644 --- a/python/paddle/static/amp/bf16/amp_lists.py +++ b/python/paddle/static/amp/bf16/amp_lists.py @@ -20,8 +20,6 @@ from ..fp16_lists import black_list as black_list_fp16 from ..fp16_lists import gray_list as gray_list_fp16 from ..fp16_lists import white_list as white_list_fp16 -__all__ = ["AutoMixedPrecisionListsBF16"] - class AutoMixedPrecisionListsBF16: """ diff --git a/python/paddle/static/amp/bf16/amp_utils.py b/python/paddle/static/amp/bf16/amp_utils.py index cf8c82127b3b45f29559444a65bf9847124ccaf3..f9a813aa44d41ccff8a035b9832fa5705fd6e17d 100644 --- a/python/paddle/static/amp/bf16/amp_utils.py +++ b/python/paddle/static/amp/bf16/amp_utils.py @@ -31,14 +31,6 @@ from ..fp16_utils import ( ) from .amp_lists import AutoMixedPrecisionListsBF16 -__all__ = [ - "bf16_guard", - "rewrite_program_bf16", - "cast_model_to_bf16", - "cast_parameters_to_bf16", - "convert_float_to_uint16", -] - _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) diff --git a/python/paddle/static/amp/bf16/decorator.py b/python/paddle/static/amp/bf16/decorator.py index 20286d3eebca5fead08b8e0e1f291478e0bb2080..66963e25634f09a1f73aed6df7945d726c0b9e40 100644 --- a/python/paddle/static/amp/bf16/decorator.py +++ b/python/paddle/static/amp/bf16/decorator.py @@ -25,8 +25,6 @@ from .amp_utils import ( rewrite_program_bf16, ) -__all__ = ["decorate_bf16"] - class OptimizerWithMixedPrecision: """ diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index ba33f6b391b0b825ad87ae6ecbf1a14778cf4b2e..827a3a8b599f87eed1a4ed3b66f5157915f6fbe4 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -34,8 +34,6 @@ from .fp16_utils import ( update_role_var_grad, ) -__all__ = ["decorate"] - class OptimizerWithMixedPrecision: """ diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index b2acd0bb5156ddab063a02304dd09d199ee38c80..b3f9b0331a86c19577a09e13b54db9f6aeb57749 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -16,8 +16,6 @@ import copy from paddle.fluid import core -__all__ = ["CustomOpLists", "AutoMixedPrecisionLists"] - # lookup_table fp16 is slower than fp32, though fp16 is supported. 
_extra_unsupported_fp16_list = { 'lookup_table', diff --git a/python/paddle/static/amp/fp16_utils.py b/python/paddle/static/amp/fp16_utils.py index c9cee2ab8d25cf96909896453703acfb483087e7..281d3638ee261c9bf8dd53e1c7feee1c50968545 100644 --- a/python/paddle/static/amp/fp16_utils.py +++ b/python/paddle/static/amp/fp16_utils.py @@ -23,8 +23,6 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from .fp16_lists import AutoMixedPrecisionLists -__all__ = ["fp16_guard", "cast_model_to_fp16", "cast_parameters_to_fp16"] - _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py index d21d95b097e3b6b813dcd7d05449ca97bdf1c537..d46c0c7c189b7508f2356e5ce24c56d74033e612 100644 --- a/python/paddle/static/nn/control_flow.py +++ b/python/paddle/static/nn/control_flow.py @@ -569,7 +569,7 @@ def case(pred_fn_pairs, default=None, name=None): This operator works like an if-elif-elif-else chain. Args: - pred_fn_pairs(list|tuple): A list or tuple of (pred, fn) pairs. ``pred`` is a boolean Tensor with shape [1], ``fn`` is a callable. All callables return the same structure of Tensors. + pred_fn_pairs(list|tuple): A list or tuple of (pred, fn) pairs. ``pred`` is a boolean Tensor whose numel should be 1 (shape [] or shape [1]), ``fn`` is a callable. All callables return the same structure of Tensors. default(callable, optional): Callable that returns a structure of Tensors. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -702,7 +702,7 @@ def switch_case(branch_index, branch_fns, default=None, name=None): This operator is like a C++ switch/case statement. Args: - branch_index(Tensor): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``. + branch_index(Tensor): A Tensor whose numel should be 1 (shape [] or shape [1]) to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``. branch_fns(dict|list|tuple): If it's a list or tuple, the elements in it could be pairs of (int, callable) or simple callables whose actual index will be used as the index of callable. If it's a dict, its key is a python integer and the value is a callable. All callables return the same structure of Tensors. default(callable, optional): Callable that returns a structure of Tensors. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -910,9 +910,9 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): branch will be executed during runtime. Args: - pred(Tensor): A boolean tensor whose numel should be 1. The boolean - value determines whether to return the result of ``true_fn`` or - ``false_fn`` . + pred(Tensor): A boolean tensor whose numel should be 1 (shape [] + or shape [1]). The boolean value determines whether to return the + result of ``true_fn`` or ``false_fn`` . true_fn(callable, optional): A callable to be performed if ``pred`` is true. The default value is ``None`` . 
false_fn(callable, optional): A callable to be performed if ``pred`` is @@ -969,7 +969,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): if _non_static_mode(): assert isinstance(pred, Variable), "The pred in cond must be Variable" assert pred.size == 1, "condition input's numel should be 1" - pred = pred.numpy()[0] + pred = pred.numpy().item() if pred: if true_fn is not None: if not callable(true_fn): diff --git a/python/paddle/static/quantization/post_training_quantization.py b/python/paddle/static/quantization/post_training_quantization.py index 024c227bcae7799add5e42752f2f57106f92bdd0..6176200128c6941f24bddb7b1600f7a7aea8a1d7 100644 --- a/python/paddle/static/quantization/post_training_quantization.py +++ b/python/paddle/static/quantization/post_training_quantization.py @@ -789,7 +789,7 @@ class PostTrainingQuantization: _logger.info("MSE searching stage ...") for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue var_tensor = var_tensor.flatten() @@ -843,7 +843,7 @@ class PostTrainingQuantization: _logger.info("EMD searching stage ...") for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue var_tensor = var_tensor.flatten() @@ -899,7 +899,7 @@ class PostTrainingQuantization: for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue abs_max_value = float(np.max(np.abs(var_tensor))) @@ -940,7 +940,7 @@ class PostTrainingQuantization: for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue abs_max_value = float(np.max(np.abs(var_tensor))) @@ -975,7 +975,7 @@ class PostTrainingQuantization: for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue min_value = float(np.min(var_tensor)) @@ -992,7 +992,7 @@ class PostTrainingQuantization: def _sample_histogram(self): for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if (not var_tensor.any()) or ( + if (var_tensor.size == 0) or ( var_name not in self._sampling_act_histogram ): self._zero_size_var_names.add(var_name) @@ -1031,7 +1031,7 @@ class PostTrainingQuantization: for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue abs_max_value = float(np.max(np.abs(var_tensor))) @@ -1094,7 +1094,7 @@ class PostTrainingQuantization: ''' for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) - if not var_tensor.any(): + if var_tensor.size == 0: self._zero_size_var_names.add(var_name) continue var_tensor = np.abs(var_tensor) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index a5492d508103736ef44c8bb4586254ca2910838f..4cce1b01968a196250d9346ec4a4e173e21f6892 100644 --- 
a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1296,7 +1296,7 @@ def t(input, name=None): "tensor.transpose() instead." % len(input.shape) ) if in_dygraph_mode(): - if len(input.shape) == 1: + if len(input.shape) <= 1: return input # 2-D tensor perm = [1, 0] @@ -1313,7 +1313,7 @@ def t(input, name=None): helper = LayerHelper('t', **locals()) out = helper.create_variable_for_type_inference(input.dtype) input_shape = helper.create_variable_for_type_inference(input.dtype) - if len(input.shape) == 1: + if len(input.shape) <= 1: out = input else: helper.append_op( diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 36e3de3f53d449a7afb3859e17260753112b66ff..375f3614e5e30c827a15f25b21d9897f73805002 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -17,11 +17,11 @@ import paddle from ..fluid.data_feeder import check_type, check_variable_and_dtype -from ..fluid.framework import _in_eager_mode_ +from ..fluid.framework import global_var from ..static import Variable from .layer_function_generator import templatedoc -if _in_eager_mode_: +if global_var._in_eager_mode_: Tensor = paddle.fluid.framework.core.eager.Tensor else: from ..framework import VarBase as Tensor diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 0a898caa3fb2f38791ca2e73215674d7d95ceefa..f5c57a312d84583bcf367ccf85666ce80363a2ec 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -623,6 +623,7 @@ HIGH_PARALLEL_JOB_NEW = [ 'test_dataset_consistency_inspection', 'test_cuda_empty_cache', 'test_cuda_graph', + 'test_cuda_graph_static_mode', 'test_disable_signal_handler', 'test_eig_op', 'test_eigh_op', @@ -2509,6 +2510,7 @@ TETRAD_PARALLEL_JOB = [ 'test_dlpack', 'test_complex_variable', 'test_cuda_graph', + 'test_cuda_graph_static_mode', 'test_custom_grad_input', 'test_accuracy_op', 'test_pool1d_api',
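A note on the `_out_grad_names` helper moved into python/paddle/jit/dy2static/utils.py above: after the backward section is appended, each forward output's initial gradient is materialized by a `fill_constant` op, and the helper picks those ops up by scanning every second op past the forward section. The sketch below is a minimal, Paddle-free mock of that scan; `FakeOp` and the op layout are assumptions for illustration, not Paddle's API.

    # Illustrative-only mock of the scan in `_out_grad_names`; `FakeOp` and
    # the op layout below are assumptions, not Paddle's API.
    class FakeOp:
        def __init__(self, op_type, out_name=None):
            self._type = op_type
            self._out = out_name

        def type(self):
            return self._type

        def output(self, slot):
            return [self._out]

    # Forward ops first, then the appended backward section, where each
    # forward output's initial gradient comes from a `fill_constant` op.
    ops = [
        FakeOp('matmul_v2'),                    # 0: forward
        FakeOp('relu'),                         # 1: forward
        FakeOp('scale'),                        # 2: fwd_end_op_index
        FakeOp('fill_constant', 'out_0@GRAD'),  # 3: init grad of output 0
        FakeOp('relu_grad'),                    # 4
        FakeOp('fill_constant', 'out_1@GRAD'),  # 5: init grad of output 1
        FakeOp('matmul_v2_grad'),               # 6
    ]
    fwd_end_op_index, out_size = 2, 2

    names = []
    for i in range(
        fwd_end_op_index + 1,
        min(fwd_end_op_index + 2 * out_size, len(ops)),
        2,
    ):
        op = ops[i]
        if op.type() == 'fill_constant':
            names.append(op.output('Out')[0])

    print(names)  # ['out_0@GRAD', 'out_1@GRAD']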
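Similarly, `_param_grad_names` has to choose among several scoped names when a parameter's gradient appears more than once in the program; its `max(..., key=...)` picks the candidate with the most 'grad/' prefixes, i.e. the most deeply nested backward scope. A minimal sketch with hypothetical variable names:

    # Hypothetical gradient names for one parameter; the deepest-scoped
    # name (most 'grad/' prefixes) is the one `_param_grad_names` keeps.
    candidate = [
        'linear_0.w_0@GRAD',
        'grad/linear_0.w_0@GRAD',
        'grad/grad/linear_0.w_0@GRAD',
    ]
    print(max(candidate, key=lambda name: name.count('grad/')))
    # -> grad/grad/linear_0.w_0@GRAD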
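The `pred.numpy()[0]` to `pred.numpy().item()` change in `cond` (python/paddle/static/nn/control_flow.py) is what makes the relaxed "shape [] or shape [1]" contract in the docstrings workable: indexing a 0-D array with `[0]` raises `IndexError`, while `.item()` extracts the scalar from either layout. A numpy-only demonstration:

    import numpy as np

    pred_0d = np.array(True)    # shape (), numel 1 -- the new 0-D case
    pred_1d = np.array([True])  # shape (1,), numel 1 -- the old case

    print(pred_0d.item(), pred_1d.item())  # True True -- works for both
    print(pred_1d[0])                      # True -- 1-D indexing still works
    try:
        pred_0d[0]                         # 0-D arrays cannot be indexed
    except IndexError as e:
        print("IndexError:", e)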
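The repeated `not var_tensor.any()` to `var_tensor.size == 0` change in post_training_quantization.py fixes a real bug, not just style: `any()` is also False for a non-empty, all-zero activation, so such tensors were wrongly treated as zero-size and skipped during sampling. A numpy-only demonstration:

    import numpy as np

    all_zero = np.zeros((2, 3), dtype=np.float32)  # non-empty, all zeros
    empty = np.empty((0, 3), dtype=np.float32)     # genuinely empty (size 0)

    # Old guard: an all-zero activation looks the same as an empty one.
    print(not all_zero.any())  # True  -> all-zero tensors were skipped
    print(not empty.any())     # True

    # New guard: only tensors with zero elements are skipped.
    print(all_zero.size == 0)  # False -> all-zero tensors are now sampled
    print(empty.size == 0)     # True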
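Finally, relaxing `len(input.shape) == 1` to `<= 1` in `paddle.t` lets 0-D tensors pass through unchanged, matching the existing 1-D behavior. A usage sketch, assuming a build with this patch applied and 0-D tensor support enabled; the values are illustrative:

    import paddle

    x0 = paddle.full([], 3.14)         # 0-D tensor, shape []
    x1 = paddle.to_tensor([1.0, 2.0])  # 1-D tensor, shape [2]
    x2 = paddle.ones([2, 3])           # 2-D tensor

    print(paddle.t(x0).shape)  # []      -- now returned unchanged
    print(paddle.t(x1).shape)  # [2]     -- unchanged, as before
    print(paddle.t(x2).shape)  # [3, 2]  -- transposed, as before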