Commit 184fa04c authored by W wangruting

fix_conflict

@@ -10,6 +10,7 @@ English | [简体中文](./README_cn.md)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+[![Twitter](https://img.shields.io/badge/Twitter-1ca0f1.svg?logo=twitter&logoColor=white)](https://twitter.com/PaddlePaddle_)
Welcome to the PaddlePaddle GitHub.
......
@@ -23,7 +23,7 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
# in case of low internet speed
#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
set(WARPCTC_INCLUDE_DIR
    "${WARPCTC_INSTALL_DIR}/include"
......
@@ -26,7 +26,7 @@ function(find_register FILENAME PATTERN OUTPUT)
      PARENT_SCOPE)
endfunction()
-function(find_phi_register FILENAME ADD_PATH)
+function(find_phi_register FILENAME ADD_PATH PATTERN)
  # set op_name to OUTPUT
  set(options "")
  set(oneValueArgs "")
@@ -36,11 +36,11 @@ function(find_phi_register FILENAME ADD_PATH)
  string(
    REGEX
    MATCH
-    "PD_REGISTER_KERNEL\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
+    "${PATTERN}\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
    register
    "${CONTENT}")
  if(NOT register STREQUAL "")
-    string(REPLACE "PD_REGISTER_KERNEL(" "" register "${register}")
+    string(REPLACE "${PATTERN}(" "" register "${register}")
    string(REPLACE "," ";" register "${register}")
    string(REGEX REPLACE "[ \\\t\r\n]+" "" register "${register}")
    string(REGEX REPLACE "//cuda_only" "" register "${register}")
@@ -401,7 +401,8 @@ function(op_library TARGET)
      # pybind USE_OP_ITSELF
      set(op_name "")
      # Add PHI Kernel Registry Message
-      find_phi_register(${cc_src} ${pybind_file})
+      find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_KERNEL")
+      find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
      find_register(${cc_src} "REGISTER_OPERATOR" op_name)
      if(NOT ${op_name} EQUAL "")
        file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
@@ -440,7 +441,8 @@ function(op_library TARGET)
    foreach(cu_src ${cu_srcs})
      set(op_name "")
      # Add PHI Kernel Registry Message
-      find_phi_register(${cu_src} ${pybind_file})
+      find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_KERNEL")
+      find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
      find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
      if(NOT ${op_name} EQUAL "")
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
......
@@ -446,7 +446,8 @@ endif()
if(WITH_DISTRIBUTE
   AND NOT WITH_PSLIB
-   AND NOT WITH_PSCORE)
+   AND NOT WITH_PSCORE
+   AND NOT WITH_RPC)
  include(external/snappy)
  list(APPEND third_party_deps extern_snappy)
......
@@ -16,6 +16,7 @@
#include "paddle/fluid/distributed/collective/bkcl_tools.h"
#include "paddle/fluid/distributed/collective/common.h"
+#include "paddle/fluid/distributed/collective/utils.h"
#include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/phi/core/device_context.h"
@@ -87,6 +88,73 @@ void ProcessGroupBKCL::GroupEnd() {
  PADDLE_ENFORCE_XPU_SUCCESS(bkcl_group_end());
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
phi::DenseTensor* tensor,
int src_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) {
// numel > 0 indicates the tensor need to be sliced
phi::DenseTensor partial_tensor;
if (numel > 0) {
partial_tensor = GetPartialTensor(*tensor, offset, numel);
tensor = &partial_tensor;
}
return Collective(
tensor,
// have to pass a tensor here
// TODO(zhangxiaoci) catch up with nccl's api
*tensor,
[&](phi::DenseTensor* output,
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
return bkcl_recv(comm,
output->data(),
output->numel(),
src_rank,
platform::ToBKCLDataType(
framework::TransToProtoVarType(output->type())),
stream);
},
CommType::RECV,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
const phi::DenseTensor& tensor,
int dst_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) {
// numel > 0 indicates the tensor need to be sliced
const phi::DenseTensor& tensor_maybe_partial =
numel > 0 ? GetPartialTensor(tensor, offset, numel) : tensor;
return Collective(
nullptr,
tensor_maybe_partial,
[&](phi::DenseTensor* output,
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
return bkcl_send(comm,
input.data(),
input.numel(),
dst_rank,
platform::ToBKCLDataType(
framework::TransToProtoVarType(input.type())),
stream);
},
CommType::SEND,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroupBKCL::BKCLTask> ProcessGroupBKCL::CreateTask(
    const Place& place,
    int rank,
@@ -136,6 +204,8 @@ void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place,
  BKCLContext_t bkcl_comm;
  BKCLCHECK(bkcl_init_rank(&bkcl_comm, GetRank(), GetSize(), &bkcl_id));
  comm_ctx->SetBkclContext(bkcl_comm);
+  // comm context creates a separate XPU stream for communication
+  comm_ctx->CreateStream();
  place_to_calc_ctx_[place_key] = calc_ctx;
  place_to_comm_ctx_[place_key] = std::move(comm_ctx);
......
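The new Send/Recv above follow an offset/numel convention: when numel > 0, only the slice [offset, offset + numel) of the tensor is communicated, otherwise the whole tensor is used. Below is a minimal, self-contained sketch of that slicing rule, using plain C++ stand-ins rather than the actual phi::DenseTensor and GetPartialTensor types (those behaviors are assumed from the surrounding code).

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for the "partial tensor" view that Send/Recv operate on.
struct PartialView {
  const float* data;
  int64_t numel;
};

// numel <= 0 -> whole buffer; numel > 0 -> slice [offset, offset + numel).
PartialView GetPartialView(const std::vector<float>& t, int64_t offset,
                           int64_t numel) {
  if (numel <= 0) {
    return {t.data(), static_cast<int64_t>(t.size())};
  }
  assert(offset + numel <= static_cast<int64_t>(t.size()));
  return {t.data() + offset, numel};
}

int main() {
  std::vector<float> tensor(8, 1.0f);
  PartialView whole = GetPartialView(tensor, 0, 0);  // all 8 elements
  PartialView half = GetPartialView(tensor, 4, 4);   // elements 4..7 only
  std::cout << whole.numel << " " << half.numel << "\n";  // prints "8 4"
  return 0;
}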
@@ -87,25 +87,25 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
  phi::DeviceContext* GetDeviceContext(const Place& place,
                                       bool use_calc_stream) const override;
-  std::shared_ptr<ProcessGroup::Task> AllReduce(
+  std::shared_ptr<ProcessGroup::Task> AllGather(
      phi::DenseTensor* out_tensor,
      const phi::DenseTensor& in_tensor,
-      const AllreduceOptions& opts,
+      int64_t offset,  // for compatibility, no use now
+      int64_t numel,   // for compatibility, no use now
      bool sync_op,
      bool use_calc_stream) override;
-  std::shared_ptr<ProcessGroup::Task> Broadcast(
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
      phi::DenseTensor* out_tensor,
      const phi::DenseTensor& in_tensor,
-      const BroadcastOptions& opts,
+      const AllreduceOptions& opts,
      bool sync_op,
      bool use_calc_stream) override;
-  std::shared_ptr<ProcessGroup::Task> AllGather(
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
      phi::DenseTensor* out_tensor,
      const phi::DenseTensor& in_tensor,
-      int64_t offset,  // for compatibility, no use now
-      int64_t numel,   // for compatibility, no use now
+      const BroadcastOptions& opts,
      bool sync_op,
      bool use_calc_stream) override;
@@ -115,6 +115,20 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
      bool sync_op,
      bool use_calc_stream) override;
+  std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
+                                           int src_rank,
+                                           int64_t offset,
+                                           int64_t numel,
+                                           bool sync_op,
+                                           bool use_calc_stream) override;
+
+  std::shared_ptr<ProcessGroup::Task> Send(const phi::DenseTensor& tensor,
+                                           int dst_rank,
+                                           int64_t offset,
+                                           int64_t numel,
+                                           bool sync_op,
+                                           bool use_calc_stream) override;
  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) override;
......
@@ -33,7 +33,7 @@ void AmplifierInterceptor::RunOps() {
  // run_per_steps_, run_at_offset_
  // 4, 0 --> run at step 0, 4, 8, 12
  // 4, 3 --> run at step 3, 7, 11, 15
-  if ((step_ % run_per_steps_) == run_at_offset_) {
+  if ((cur_scope_id_ % run_per_steps_) == run_at_offset_) {
    ComputeInterceptor::RunOps();
  }
}
@@ -41,7 +41,7 @@ void AmplifierInterceptor::RunOps() {
void AmplifierInterceptor::SendDataReadyToDownStream() {
  // run multi times, send ready one times to downstream, that is
  // input multi times, output one times
-  if (step_ % send_down_per_steps_ == 0) {
+  if (cur_scope_id_ % send_down_per_steps_ == 0) {
    ComputeInterceptor::SendDataReadyToDownStream();
  }
}
@@ -49,7 +49,7 @@ void AmplifierInterceptor::SendDataReadyToDownStream() {
void AmplifierInterceptor::ReplyCompletedToUpStream() {
  // run multi times, reply one times to upstream, that is
  // input one times, output multi times
-  if (step_ % reply_up_per_steps_ == 0) {
+  if (cur_scope_id_ % reply_up_per_steps_ == 0) {
    ComputeInterceptor::ReplyCompletedToUpStream();
  }
}
......
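The comments above describe the amplifier's modular schedule: with run_per_steps_ = 4 and run_at_offset_ = 3, ops run only in scopes 3, 7, 11, 15. A tiny stand-alone sketch of that arithmetic follows; the member names are mirrored as local variables purely for illustration, this is not the actual interceptor class.

#include <cstdint>
#include <iostream>

// Reproduces the scheduling predicate used in AmplifierInterceptor::RunOps()
// for a handful of scope ids.
int main() {
  const int64_t run_per_steps = 4;
  const int64_t run_at_offset = 3;
  for (int64_t cur_scope_id = 0; cur_scope_id < 16; ++cur_scope_id) {
    if ((cur_scope_id % run_per_steps) == run_at_offset) {
      std::cout << "run ops at scope " << cur_scope_id << "\n";  // 3, 7, 11, 15
    }
  }
  return 0;
}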
@@ -21,7 +21,7 @@
namespace paddle {
namespace distributed {
-class AmplifierInterceptor : public ComputeInterceptor {
+class AmplifierInterceptor final : public ComputeInterceptor {
 public:
  AmplifierInterceptor(int64_t interceptor_id, TaskNode* node);
......
@@ -71,6 +71,9 @@ void Carrier::Init(
    microbatch_scopes_[i] = &minibatch_scope_->NewScope();
    CopyParameters(i, program, inference_root_scope_vars);
  }
+  // Add source and sink interceptor id to rank
+  interceptor_id_to_rank_.emplace(SOURCE_ID, rank);
+  interceptor_id_to_rank_.emplace(SINK_ID, rank);
  // TODO(fleet_exe dev): thread pool
  thread_num_ = 1;
@@ -159,16 +162,10 @@ void Carrier::Start() {
      true,
      platform::errors::PreconditionNotMet(
          "Using carrier before initialized."));
-  for (int64_t id : source_interceptor_ids_) {
-    VLOG(3) << "Carrier Start is sending start to source interceptor " << id
-            << ".";
-    InterceptorMessage start_msg;
-    // source node data_is_ready is send by carrier, so set src_id=-1
-    start_msg.set_src_id(-1);
-    start_msg.set_dst_id(id);
-    start_msg.set_message_type(DATA_IS_READY);
-    Send(start_msg);
-  }
+  InterceptorMessage start_msg;
+  start_msg.set_dst_id(SOURCE_ID);
+  start_msg.set_message_type(START);
+  Send(start_msg);
  // TODO(wangxi): async step
  Wait();
  dev_ctx_->Wait();
@@ -270,6 +267,38 @@ void Carrier::CreateInterceptors() {
  auto gc = GetGC(place_);
// create source and sink task node
auto max_run_times = microbatch_scopes_.size();
TaskNode* source = new TaskNode(
rank_, SOURCE_ID, max_run_times); // rank, task_id, max_run_times
TaskNode* sink = new TaskNode(rank_, SINK_ID, max_run_times);
// find nodes without upstreams or without downstreams
std::vector<TaskNode*> origin_sources, origin_sinks;
for (const auto& item : interceptor_id_to_node_) {
TaskNode* task_node = item.second;
if (task_node->upstream().empty()) {
origin_sources.emplace_back(task_node);
}
if (task_node->downstream().empty()) {
origin_sinks.emplace_back(task_node);
}
}
// link source node with origin source
for (const auto& node : origin_sources) {
source->AddDownstreamTask(node->task_id(),
std::numeric_limits<int64_t>::max());
node->AddUpstreamTask(SOURCE_ID, std::numeric_limits<int64_t>::max());
}
// link sink node with origin sink
for (const auto& node : origin_sinks) {
sink->AddUpstreamTask(node->task_id(), std::numeric_limits<int64_t>::max());
node->AddDownstreamTask(SINK_ID, std::numeric_limits<int64_t>::max());
}
// create source and sink interceptor
SetInterceptor(SOURCE_ID,
InterceptorFactory::Create("Source", SOURCE_ID, source));
SetInterceptor(SINK_ID, InterceptorFactory::Create("Sink", SINK_ID, sink));
  // create each Interceptor
  // no auto init since there is no config
  for (const auto& item : interceptor_id_to_node_) {
@@ -303,9 +332,15 @@ void Carrier::CreateInterceptors() {
    VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
            << " with type: " << task_node->type() << ".";
-    if (task_node->upstream().empty()) {
-      source_interceptor_ids_.emplace_back(interceptor_id);
-    }
+    PADDLE_ENFORCE_EQ(
+        task_node->upstream().empty(),
+        false,
+        platform::errors::PreconditionNotMet(
+            "There should not have normal nodes as source nodes"));
+    PADDLE_ENFORCE_EQ(task_node->downstream().empty(),
+                      false,
+                      platform::errors::PreconditionNotMet(
+                          "There should not have normal nodes as sink nodes"));
  }
}
......
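The CreateInterceptors() change above inserts a virtual source and sink into the task graph: every task node without an upstream is linked under SOURCE_ID, every node without a downstream is linked into SINK_ID, and ordinary nodes may no longer act as sources or sinks. A toy sketch of that wiring with plain STL containers follows; the ids and graph here are placeholders, not the real TaskNode API.

#include <cstdint>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

// Placeholder ids; the real SOURCE_ID/SINK_ID come from fleet_executor.
constexpr int64_t kSourceId = -1;
constexpr int64_t kSinkId = -2;

int main() {
  // task_id -> (upstream ids, downstream ids) for a 3-node chain 0 -> 1 -> 2
  std::map<int64_t, std::pair<std::vector<int64_t>, std::vector<int64_t>>> g =
      {{0, {{}, {1}}}, {1, {{0}, {2}}}, {2, {{1}, {}}}};
  for (auto& [id, edges] : g) {
    if (edges.first.empty()) {   // origin source -> attach virtual SOURCE
      edges.first.push_back(kSourceId);
      std::cout << "SOURCE -> " << id << "\n";
    }
    if (edges.second.empty()) {  // origin sink -> attach virtual SINK
      edges.second.push_back(kSinkId);
      std::cout << id << " -> SINK\n";
    }
  }
  return 0;
}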
@@ -100,8 +100,6 @@ class Carrier final {
  std::unordered_map<int64_t, std::unique_ptr<Interceptor>>
      interceptor_idx_to_interceptor_;
-  std::vector<int64_t> source_interceptor_ids_;
  bool is_init_{false};
  std::mutex running_mutex_;
......
@@ -34,29 +34,10 @@ void ComputeInterceptor::PrepareDeps() {
  for (auto up : upstream) {
    in_readys_.emplace(up.first, std::make_pair(up.second, 0));
-    in_stops_.emplace(up.first, false);
  }
  for (auto down : downstream) {
    out_buffs_.emplace(down.first, std::make_pair(down.second, 0));
  }
-  // source compute node, should we add a new SourceInterceptor?
-  if (upstream.empty()) {
-    is_source_ = true;
-    PADDLE_ENFORCE_GT(node_->max_run_times(),
-                      0,
-                      platform::errors::InvalidArgument(
-                          "Source ComputeInterceptor must run at least one "
-                          "times, but now max_run_times=%ld",
-                          node_->max_run_times()));
-    in_readys_.emplace(-1,
-                       std::make_pair(std::numeric_limits<int64_t>::max(), 0));
-  }
-  // If there is no downstream or every downstream is in different rank,
-  // then this interceptor is the last one for current rank.
-  // This can be get during init, can be cached for later use.
-  is_last_ = downstream.empty();
}
void ComputeInterceptor::IncreaseReady(int64_t up_id) {
@@ -66,12 +47,6 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) {
                    platform::errors::NotFound(
                        "Cannot find upstream=%lld in in_readys.", up_id));
-  // source node has no upstream, data_is_ready is send by carrier or others
-  if (is_source_ && up_id == -1) {
-    it->second.second += GetTaskNode()->max_run_times();
-    return;
-  }
  auto max_ready_size = it->second.first;
  auto ready_size = it->second.second;
  ready_size += 1;
@@ -152,7 +127,7 @@ void ComputeInterceptor::SendDataReadyToDownStream() {
    ready_msg.set_message_type(DATA_IS_READY);
    VLOG(3) << "ComputeInterceptor " << interceptor_id_
            << " Send data_is_ready msg to " << down_id
-            << " for step: " << step_;
+            << " in scope: " << cur_scope_id_;
    Send(down_id, ready_msg);
  }
}
@@ -173,8 +148,7 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
    VLOG(3) << "ComputeInterceptor " << interceptor_id_
            << " Reply data_is_useless msg to " << up_id
-            << " for step: " << step_;
-    if (is_source_ && up_id == -1) return;
+            << " in scope: " << cur_scope_id_;
    InterceptorMessage reply_msg;
    reply_msg.set_message_type(DATA_IS_USELESS);
@@ -183,16 +157,20 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
  }
}
void ComputeInterceptor::RunOps() {
-  VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
-          << step_ + 1 << " time.";
  for (auto op : node_->ops()) {
-    op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_);
+    PADDLE_ENFORCE_LT(cur_scope_id_,
+                      microbatch_scopes_.size(),
+                      platform::errors::InvalidArgument(
+                          "Step out of range. There are %ld "
+                          "microbatch_scopes, but recevice scope index %ld",
+                          microbatch_scopes_.size(),
+                          cur_scope_id_));
+    op->Run(*microbatch_scopes_[cur_scope_id_], place_);
    if (gc_) {
-      framework::DeleteUnusedTensors(
-          *microbatch_scopes_[step_ % node_->max_run_times()],
-          op,
-          node_->unused_vars(),
-          gc_.get());
+      framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
+                                     op,
+                                     node_->unused_vars(),
+                                     gc_.get());
    }
  }
}
@@ -201,77 +179,28 @@ void ComputeInterceptor::Run() {
  while (IsInputReady() && CanWriteOutput()) {
    VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running";
+    // get the ready scope id from queue
+    cur_scope_id_ = ready_queue_.front();
+    ready_queue_.pop();
    RunOps();
-    ++step_;
    // send to downstream and increase buff used
    SendDataReadyToDownStream();
    // reply to upstream and decrease ready data
    ReplyCompletedToUpStream();
-    // Try to stop Carrier
-    if (is_last_ && (step_ % node_->max_run_times() == 0)) {
-      VLOG(3) << "Interceptor " << GetInterceptorId()
-              << " is stopping carrier.";
-      // FIXME(wangxi): with multi sink interceptor
-      StopCarrier();
-    }
  }
}
-void ComputeInterceptor::ReceivedStop(int64_t up_id) {
-  received_stop_ = true;
-  // source node has no upstream, stop is send by carrier or others
-  if (is_source_ && up_id == -1) return;
-  auto it = in_stops_.find(up_id);
-  PADDLE_ENFORCE_NE(it,
-                    in_stops_.end(),
-                    platform::errors::NotFound(
-                        "Cannot find upstream=%lld in in_stops.", up_id));
-  PADDLE_ENFORCE_EQ(
-      it->second,
-      false,
-      platform::errors::AlreadyExists("Already received stop from %lld, stop "
-                                      "cannot be send more than once."));
-  it->second = true;
-}
-void ComputeInterceptor::TryStop() {
-  if (!received_stop_) return;
-  // can stop only when all upstream is stop and
-  // downstream complete
-  for (auto& in_stop : in_stops_) {
-    if (!in_stop.second) return;
-  }
-  for (auto& out_buff : out_buffs_) {
-    auto used_size = out_buff.second.second;
-    if (used_size != 0) return;
-  }
-  // send stop to downstream
-  for (auto& out : out_buffs_) {
-    auto down_id = out.first;
-    InterceptorMessage stop;
-    stop.set_message_type(STOP);
-    Send(down_id, stop);
-  }
-  stop_ = true;
-}
void ComputeInterceptor::Compute(const InterceptorMessage& msg) {
  if (msg.message_type() == DATA_IS_READY) {
    IncreaseReady(msg.src_id());
+    ready_queue_.push(msg.scope_idx());
    Run();
  } else if (msg.message_type() == DATA_IS_USELESS) {
    DecreaseBuff(msg.src_id());
    Run();
-  } else if (msg.message_type() == STOP) {
-    ReceivedStop(msg.src_id());
  }
-  TryStop();
}
REGISTER_INTERCEPTOR(Compute, ComputeInterceptor);
......
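The reworked interceptor above replaces the per-interceptor step_ counter with a queue of ready scope ids: every DATA_IS_READY message carries a scope_idx, Compute() pushes it onto ready_queue_, and Run() pops one id per iteration to pick the micro-batch scope. A minimal self-contained sketch of that message-driven pattern follows, using plain STL types rather than the Paddle interceptor machinery.

#include <cstdint>
#include <iostream>
#include <queue>
#include <vector>

// Toy version of the flow: messages deliver scope ids, the handler enqueues
// them, and the run loop pops one id per step to select the scope to run in.
class ToyInterceptor {
 public:
  explicit ToyInterceptor(size_t num_scopes) : scopes_(num_scopes, 0) {}

  // analogous to Compute() receiving DATA_IS_READY with msg.scope_idx()
  void OnDataReady(int64_t scope_idx) {
    ready_queue_.push(scope_idx);
    Run();
  }

 private:
  void Run() {
    while (!ready_queue_.empty()) {
      int64_t cur_scope_id = ready_queue_.front();
      ready_queue_.pop();
      // analogous to RunOps() on microbatch_scopes_[cur_scope_id]
      scopes_[cur_scope_id] += 1;
      std::cout << "ran ops in scope " << cur_scope_id << "\n";
    }
  }

  std::queue<int64_t> ready_queue_;
  std::vector<int> scopes_;
};

int main() {
  ToyInterceptor interceptor(/*num_scopes=*/3);
  interceptor.OnDataReady(0);
  interceptor.OnDataReady(1);
  interceptor.OnDataReady(2);
  return 0;
}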
@@ -14,6 +14,7 @@
#pragma once
+#include <queue>
#include <utility>
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
@@ -30,7 +31,8 @@ class ComputeInterceptor : public Interceptor {
  virtual void SendDataReadyToDownStream();
  virtual void ReplyCompletedToUpStream();
-  int64_t step_{0};
+  std::queue<int64_t> ready_queue_;
+  int64_t cur_scope_id_;
 private:
  void PrepareDeps();
@@ -43,19 +45,10 @@ class ComputeInterceptor : public Interceptor {
  void Run();
  void Compute(const InterceptorMessage& msg);
-  void ReceivedStop(int64_t up_id);
-  void TryStop();
-  bool is_source_{false};
-  bool is_last_{false};
  // upstream_id-->(max_ready_size, ready_size)
  std::map<int64_t, std::pair<int64_t, int64_t>> in_readys_{};
  // downstream_id-->(max_buffer_size, used_size)
  std::map<int64_t, std::pair<int64_t, int64_t>> out_buffs_{};
-  bool received_stop_{false};
-  std::map<int64_t, bool> in_stops_{};
};
}  // namespace distributed
......
@@ -93,7 +93,6 @@ class Interceptor {
  TaskNode* node_;
  // for stop
-  bool stop_{false};
  void StopCarrier();
  // for runtime
@@ -114,9 +113,6 @@ class Interceptor {
  std::mutex mutex_;
  std::deque<InterceptorMessage> messages_;
-  int64_t already_run_times_{0};
-  int64_t used_slot_nums_{0};
};
class InterceptorFactory {
......
@@ -25,7 +25,7 @@ namespace distributed {
 * 1. record the num of micro-step
 * 2. check whether to notify carrier the current step is finished
 */
-class SinkInterceptor : public Interceptor {
+class SinkInterceptor final : public Interceptor {
 public:
  SinkInterceptor(int64_t interceptor_id, TaskNode* node);
......
@@ -25,7 +25,7 @@ namespace distributed {
 * 1. receive `start` message from carrier
 * 2. send num_of_steps `data_is_ready` message to downstream
 */
-class SourceInterceptor : public Interceptor {
+class SourceInterceptor final : public Interceptor {
 public:
  SourceInterceptor(int64_t interceptor_id, TaskNode* node);
......
@@ -25,57 +25,42 @@ limitations under the License. */
namespace paddle {
namespace distributed {
-class StartInterceptor : public Interceptor {
- public:
-  StartInterceptor(int64_t interceptor_id, TaskNode* node)
-      : Interceptor(interceptor_id, node) {
-    RegisterMsgHandle([this](const InterceptorMessage& msg) { NOP(msg); });
-  }
-  void NOP(const InterceptorMessage& msg) {
-    if (msg.message_type() == STOP) {
-      stop_ = true;
-      InterceptorMessage stop;
-      stop.set_message_type(STOP);
-      Send(1, stop);  // stop 1, compute
-      return;
-    }
-    std::cout << GetInterceptorId() << " recv msg from " << msg.src_id()
-              << std::endl;
-  }
-};
TEST(ComputeInterceptor, Compute) {
  std::string carrier_id = "0";
  Carrier* carrier =
      GlobalMap<std::string, Carrier>::Create(carrier_id, carrier_id);
-  carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}});
+  carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}, {1, 0}, {SINK_ID, 0}});
  MessageBus* msg_bus = GlobalVal<MessageBus>::Create();
  msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "");
  // NOTE: don't delete, otherwise interceptor will use undefined node
-  TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);  // role, rank, task_id
+  TaskNode* source =
+      new TaskNode(0, SOURCE_ID, 3);  // rank, task_id, max_run_times
+  TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);
  TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0);
-  TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0);
+  TaskNode* sink = new TaskNode(0, SINK_ID, 3);
-  // a->b->c
+  // source->a->b->sink
+  source->AddDownstreamTask(0);
+  node_a->AddUpstreamTask(SOURCE_ID);
  node_a->AddDownstreamTask(1, 3);
  node_b->AddUpstreamTask(0, 3);
-  node_b->AddDownstreamTask(2);
-  node_c->AddUpstreamTask(1);
+  node_b->AddDownstreamTask(SINK_ID);
+  sink->AddUpstreamTask(1);
-  Interceptor* a = carrier->SetInterceptor(
-      0, std::make_unique<StartInterceptor>(0, node_a));
+  carrier->SetInterceptor(
+      SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source));
+  carrier->SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a));
  carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b));
-  carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c));
+  carrier->SetInterceptor(SINK_ID,
+                          InterceptorFactory::Create("Sink", SINK_ID, sink));
+  // start
  InterceptorMessage msg;
-  msg.set_message_type(DATA_IS_READY);
-  // test run three times
-  a->Send(1, msg);
-  a->Send(1, msg);
-  a->Send(1, msg);
+  msg.set_message_type(START);
+  msg.set_dst_id(SOURCE_ID);
+  carrier->EnqueueInterceptorMessage(msg);
  carrier->Wait();
  carrier->Release();
......
@@ -33,7 +33,6 @@ class PingPongInterceptor : public Interceptor {
  void PingPong(const InterceptorMessage& msg) {
    if (msg.message_type() == STOP) {
-      stop_ = true;
      return;
    }
    std::cout << GetInterceptorId() << " recv msg, count=" << count_
......
@@ -36,7 +36,6 @@ class PingPongInterceptor : public Interceptor {
  void PingPong(const InterceptorMessage& msg) {
    if (msg.message_type() == STOP) {
-      stop_ = true;
      StopCarrier();
      return;
    }
......
@@ -22,6 +22,10 @@ if(WITH_ROCM)
  target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB})
endif()
+if(WITH_CINN)
+  target_link_libraries(eager_generator ${PYTHON_LIBRARIES})
+endif()
# Prepare file structure
message(
  "Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated"
......
@@ -47,7 +47,7 @@ void SetFeedVariable(Scope* scope,
}
void SetFeedVariable(Scope* scope,
-                     const Strings& input,
+                     const std::vector<std::string>& input,
                     const std::string& var_name,
                     size_t index) {
  // If var_name Variable is not found in GlobalScope, a new variable will
@@ -59,7 +59,7 @@ void SetFeedVariable(Scope* scope,
    feed_inputs.resize(index + 1);
  }
  // shared data with input tensor
-  feed_inputs[index] = input;
+  feed_inputs[index] = Strings(input);
}
FetchType& GetFetchVariable(const Scope& scope,
......
@@ -35,7 +35,7 @@ void SetFeedVariable(Scope* scope,
                     size_t index);
void SetFeedVariable(Scope* scope,
-                     const Strings& input,
+                     const std::vector<std::string>& input,
                     const std::string& var_name,
                     size_t index);
......
@@ -19,12 +19,14 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
+#include "paddle/phi/core/extended_tensor.h"
namespace paddle {
namespace framework {
using FeedType =
    paddle::variant<phi::DenseTensor, Strings, phi::SparseCooTensor>;
-using FeedList = std::vector<FeedType>;
+using FeedList = paddle::framework::PhiVector<FeedType>;
using FetchType = paddle::variant<phi::DenseTensor,
                                  LoDTensorArray,
......
@@ -117,6 +117,15 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
    return var_type == proto::VarType::SPARSE_COO;
  }
+  bool IsSparseCooTensorOutput(const std::string& name) const override {
+    auto var_types = ctx_.GetOutputsVarType(name);
+    return std::all_of(var_types.begin(),
+                       var_types.end(),
+                       [](const proto::VarType::Type& type) {
+                         return type == proto::VarType::SPARSE_COO;
+                       });
+  }
  bool IsSparseCsrTensorInput(const std::string& name) const override {
    auto var_type = ctx_.GetInputVarType(name);
    return var_type == proto::VarType::SPARSE_CSR;
......
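The new IsSparseCooTensorOutput override treats an output slot as sparse COO only when every variable bound to it has that type. A stand-alone restatement of the std::all_of check is shown below; the enum is a placeholder for proto::VarType, used only to keep the snippet self-contained.

#include <algorithm>
#include <iostream>
#include <vector>

// Placeholder for proto::VarType::Type.
enum class VarType { LOD_TENSOR, SPARSE_COO, SPARSE_CSR };

// True only if every output variable has the SPARSE_COO type.
bool IsSparseCooTensorOutput(const std::vector<VarType>& var_types) {
  return std::all_of(var_types.begin(), var_types.end(),
                     [](VarType type) { return type == VarType::SPARSE_COO; });
}

int main() {
  std::cout << IsSparseCooTensorOutput({VarType::SPARSE_COO}) << "\n";  // 1
  std::cout << IsSparseCooTensorOutput(
                   {VarType::SPARSE_COO, VarType::LOD_TENSOR})
            << "\n";  // 0
  return 0;
}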
@@ -126,6 +126,45 @@ void InferShapeUtilsTestKernel(const Context& dev_ctx,
  VLOG(6) << "Come into InferShapeUtilsTestKernel";
}
void TestOutputInferMeta(const phi::MetaTensor& x, phi::MetaTensor* out) {
ASSERT_EQ(x.dtype(), phi::DataType::FLOAT32);
}
class InferShapeUtilsTestOutputOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddOutput("Out", "output of test op");
AddComment("This is test op");
}
};
class InferShapeUtilsTestOutputOp : public OperatorWithKernel {
public:
using OperatorWithKernel::OperatorWithKernel;
phi::KernelKey GetExpectedKernelType(
const ExecutionContext& ctx) const override {
return phi::KernelKey(proto::VarType::FP32, ctx.GetPlace());
}
};
phi::KernelSignature TestSparseOutputOpArgumentMapping(
const phi::ArgumentMappingContext& ctx) {
if (ctx.IsSparseCooTensorOutput("Out")) {
return phi::KernelSignature(
"test_sparse_coo_tensor_output", {"X"}, {}, {"Out"});
}
return phi::KernelSignature("test_output", {"X"}, {}, {"Out"});
}
template <typename T, typename Context>
void InferShapeUtilsTestOutputKernel(const Context& dev_ctx,
const phi::DenseTensor& x,
phi::SparseCooTensor* out) {
VLOG(6) << "Come into InferShapeUtilsTestOutputKernel";
}
}  // namespace framework
}  // namespace paddle
@@ -143,6 +182,21 @@ PD_REGISTER_KERNEL(infer_shape_utils_test,
                   paddle::framework::InferShapeUtilsTestKernel,
                   int) {}
DECLARE_INFER_SHAPE_FUNCTOR(
infer_shape_utils_test_output,
InferShapeUtilsTestOutputInferShapeFunctor,
PD_INFER_META(paddle::framework::TestOutputInferMeta));
REGISTER_OPERATOR(infer_shape_utils_test_output,
paddle::framework::InferShapeUtilsTestOutputOp,
paddle::framework::InferShapeUtilsTestOutputOpMaker,
InferShapeUtilsTestOutputInferShapeFunctor);
PD_REGISTER_KERNEL(test_sparse_coo_tensor_output,
CPU,
ALL_LAYOUT,
paddle::framework::InferShapeUtilsTestOutputKernel,
int) {}
TEST(InferShapeUtilsTest, ALL) {
  paddle::framework::ProgramDesc prog;
  paddle::framework::proto::BlockDesc proto_block;
@@ -200,3 +254,27 @@ TEST(InferShapeUtilsTest, ALL) {
  op->InferShape(block_desc);
}
TEST(InferShapeUtilsTestOutput, ALL) {
paddle::framework::ProgramDesc prog;
paddle::framework::proto::BlockDesc proto_block;
paddle::framework::BlockDesc block_desc(&prog, &proto_block);
auto* op = block_desc.AppendOp();
op->SetType("infer_shape_utils_test_output");
auto* x = block_desc.Var("x");
x->SetType(paddle::framework::proto::VarType::LOD_TENSOR);
x->SetDataType(paddle::framework::proto::VarType::FP32);
op->SetInput("X", {"x"});
auto* out = block_desc.Var("out");
out->SetType(paddle::framework::proto::VarType::SPARSE_COO);
op->SetOutput("Out", {"out"});
phi::OpUtilsMap::Instance().InsertArgumentMappingFn(
"infer_shape_utils_test_output",
paddle::framework::TestSparseOutputOpArgumentMapping);
op->InferShape(block_desc);
}
@@ -460,14 +460,6 @@ if(WITH_MKLDNN)
    test_cpu_quantize_squash_pass
    SRCS mkldnn/cpu_quantize_squash_pass_tester.cc
    DEPS cpu_quantize_squash_pass naive_executor)
-  cc_test(
-    test_reshape_transpose_matmul_mkldnn_fuse_pass
-    SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
-    DEPS reshape_transpose_matmul_mkldnn_fuse_pass)
-  cc_test(
-    test_matmul_transpose_reshape_fuse_pass
-    SRCS mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
-    DEPS matmul_transpose_reshape_mkldnn_fuse_pass)
  cc_test(
    test_shuffle_channel_mkldnn_detect_pass
    SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc *prog,
const std::string &type,
const std::vector<std::string> &inputs,
const std::vector<std::string> &outputs) {
auto *op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetInput("X", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
if (type == "transpose2") {
op->SetAttr("axis", std::vector<int>({0, 2, 1, 3}));
op->SetOutput("XShape", {outputs[1]});
}
if (type == "reshape2") {
op->SetAttr("shape", std::vector<int>({4, 5, 6}));
op->SetOutput("XShape", {outputs[1]});
}
if (type == "matmul") {
op->SetInput("Y", {inputs[1]});
op->SetAttr("use_mkldnn", true);
op->SetAttr("alpha", 1.0f);
op->SetAttr("transpose_X", true);
op->SetAttr("transpose_Y", true);
}
if (type == "matmul_v2") {
op->SetInput("Y", {inputs[1]});
op->SetAttr("use_mkldnn", true);
op->SetAttr("trans_x", true);
op->SetAttr("trans_y", true);
}
}
ProgramDesc BuildProgramDesc(const std::string &op_name) {
ProgramDesc prog;
for (auto &v : std::initializer_list<std::string>(
{"a1", "a2", "b", "c", "cx", "d", "dx", "e"})) {
auto *var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
}
SetOp(&prog, op_name, {"a1", "a2"}, {"b"});
SetOp(&prog, "transpose2", {"b"}, {"c", "cx"});
SetOp(&prog, "reshape2", {"c"}, {"d", "dx"});
SetOp(&prog, "fc", {"d"}, {"e"});
return prog;
}
void MainTest(const ProgramDesc &prog, const std::string &op_name) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int original_nodes_num = graph->Nodes().size();
auto pass =
PassRegistry::Instance().Get("matmul_transpose_reshape_mkldnn_fuse_pass");
graph.reset(pass->Apply(graph.release()));
int current_nodes_num = graph->Nodes().size();
EXPECT_EQ(original_nodes_num - 6, current_nodes_num);
for (auto *node : graph->Nodes()) {
if (node->IsOp()) {
auto *op = node->Op();
if (op->Type() == op_name) {
EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_reshape_Out"),
std::vector<int>({4, 5, 6}));
EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_transpose_Out"),
std::vector<int>({0, 2, 1, 3}));
}
}
}
}
TEST(MatmulTransposeReshapeFusePass, matmul_fuse_pass) {
auto prog = BuildProgramDesc("matmul");
MainTest(prog, "matmul");
}
TEST(MatmulTransposeReshapeFusePass, matmul_v2_fuse_pass) {
auto prog = BuildProgramDesc("matmul_v2");
MainTest(prog, "matmul_v2");
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(matmul_transpose_reshape_mkldnn_fuse_pass);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void AddVarToScope(Scope* param_scope,
const std::string& name,
const DDim& dims) {
auto* tensor = param_scope->Var(name)->GetMutable<phi::DenseTensor>();
tensor->Resize(dims);
tensor->mutable_data<float>(phi::CPUPlace());
}
Scope* CreateParamScope() {
auto param_scope = new Scope();
AddVarToScope(param_scope, "w1", {768, 768});
AddVarToScope(param_scope, "bias1", {768});
AddVarToScope(param_scope, "w2", {768, 768});
AddVarToScope(param_scope, "bias2", {768});
return param_scope;
}
void TestMain(const std::string& op_name, bool with_xshapes) {
// inputs operator output
// -----------------------------------------------
// a1,w1,bias1 fc -> b1
// b1 reshape -> c1
// c1 transpose -> d1
// a2,w2,bias2 fc -> b2
// b2 reshape -> c2
// c2 transpose -> d2
// (d1, d2) matmul(_v2) -> (...)
Layers layers;
auto* a1 = layers.data("a1", {-1, 128, 768});
auto* w1 = layers.data("w1", {768, 768}, true);
auto* bias1 = layers.data("bias1", {768}, true);
auto* b1 = layers.fc(a1, w1, bias1, 2);
b1->SetShape({-1, 128, 768});
auto* c1 = layers.reshape2(b1, {0, 0, 12, 64}, with_xshapes);
c1->SetShape({-1, 128, 12, 64});
auto* d1 = layers.transpose2(c1, {0, 2, 1, 3}, with_xshapes);
d1->SetShape({-1, 12, 128, 64});
auto* a2 = layers.data("a2", {-1, 128, 768});
auto* w2 = layers.data("w2", {768, 768}, true);
auto* bias2 = layers.data("bias2", {768}, true);
auto* b2 = layers.fc(a2, w2, bias2, 2);
b2->SetShape({-1, 128, 768});
auto* c2 = layers.reshape2(b2, {0, 0, 12, 64});
c2->SetShape({-1, 128, 12, 64});
auto* d2 = layers.transpose2(c2, {0, 2, 1, 3});
d2->SetShape({-1, 12, 128, 64});
if (op_name == "matmul_v2") {
layers.matmul_v2(d1, d2);
} else {
layers.matmul(d1, d2);
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
graph->Set("__param_scope__", CreateParamScope());
int num_reshape_nodes_before = GetNumOpNodes(graph, "reshape2");
int num_transpose_nodes_before = GetNumOpNodes(graph, "transpose2");
int total_nodes_before = graph->Nodes().size();
VLOG(3) << DebugString(graph);
auto pass =
PassRegistry::Instance().Get("reshape_transpose_matmul_mkldnn_fuse_pass");
graph.reset(pass->Apply(graph.release()));
int num_reshape_nodes_after = GetNumOpNodes(graph, "reshape2");
int num_transpose_nodes_after = GetNumOpNodes(graph, "transpose2");
int total_nodes_after = graph->Nodes().size();
VLOG(3) << DebugString(graph);
EXPECT_EQ(num_reshape_nodes_before, 2);
EXPECT_EQ(num_reshape_nodes_after, 0);
EXPECT_EQ(num_transpose_nodes_before, 2);
EXPECT_EQ(num_transpose_nodes_after, 0);
int removed = 8; // 2* reshape, reshape_out, transpose, transpose_out
if (with_xshapes) removed += 2; // transpose_xshape, reshape_xshape
EXPECT_EQ(total_nodes_before - removed, total_nodes_after);
auto* matmul_op_desc = GetOpNodes(graph, op_name).at(0)->Op();
auto check = [&matmul_op_desc](std::string a) {
std::string shape_str = "fused_reshape_" + a;
auto shape = matmul_op_desc->GetAttrIfExists<std::vector<int>>(shape_str);
EXPECT_EQ(shape, (std::vector<int>{0, 0, 12, 64}));
std::string axis_str = "fused_transpose_" + a;
auto axis = matmul_op_desc->GetAttrIfExists<std::vector<int>>(axis_str);
EXPECT_EQ(axis, (std::vector<int>{0, 2, 1, 3}));
};
check("X");
check("Y");
}
TEST(ReshapeTransposeMatmulMkldnnFusePass,
both_matmul_inputs_reshape_transpose) {
TestMain("matmul", false);
}
TEST(ReshapeTransposeMatmulMkldnnFusePass,
both_matmul_inputs_reshape_transpose_one_with_xshapes) {
TestMain("matmul", true);
}
TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
both_matmulv2_inputs_reshape_transpose) {
TestMain("matmul_v2", false);
}
TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
both_matmulv2_inputs_reshape_transpose_one_with_xshapes) {
TestMain("matmul_v2", true);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(reshape_transpose_matmul_mkldnn_fuse_pass);
@@ -19,14 +19,27 @@
#include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h"
DECLARE_bool(fast_eager_deletion_mode);
+DECLARE_bool(new_executor_use_cuda_graph);
namespace paddle {
namespace framework {
bool IsInterpretercoreFastGCEnabled() {
-  return memory::allocation::AllocatorFacade::Instance()
-             .IsStreamSafeCUDAAllocatorUsed() &&
-         FLAGS_fast_eager_deletion_mode;
+  // When using cuda graph, fast GC must be used. Because
+  // `EventQuery` method in event GC cannot be used in
+  // cuda graph.
+  PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
+                            .IsStreamSafeCUDAAllocatorUsed() == false &&
+                        FLAGS_new_executor_use_cuda_graph,
+                    false,
+                    platform::errors::InvalidArgument(
+                        "When FLAGS_new_executor_use_cuda_graph is true, "
+                        "IsStreamSafeCUDAAllocatorUsed must be true, but "
+                        "got false."));
+  return (memory::allocation::AllocatorFacade::Instance()
+              .IsStreamSafeCUDAAllocatorUsed() &&
+          FLAGS_fast_eager_deletion_mode) ||
+         FLAGS_new_executor_use_cuda_graph;
}
InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() {
......
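The rewritten predicate above enables fast GC when the stream-safe CUDA allocator and fast eager deletion are both on, or unconditionally when the new executor runs under CUDA Graph, and it rejects the CUDA Graph case if the stream-safe allocator is off. A small stand-alone restatement of that logic follows, with plain booleans standing in for the real FLAGS and allocator facade.

#include <iostream>
#include <stdexcept>

// Flag names mirror the diff above; this is an illustration, not the real
// gflags/AllocatorFacade machinery.
bool IsFastGCEnabled(bool stream_safe_allocator_used,
                     bool fast_eager_deletion_mode,
                     bool new_executor_use_cuda_graph) {
  // CUDA Graph capture requires the stream-safe allocator.
  if (!stream_safe_allocator_used && new_executor_use_cuda_graph) {
    throw std::invalid_argument(
        "new_executor_use_cuda_graph requires the stream-safe allocator");
  }
  return (stream_safe_allocator_used && fast_eager_deletion_mode) ||
         new_executor_use_cuda_graph;
}

int main() {
  std::cout << IsFastGCEnabled(true, false, true) << "\n";   // 1: forced by cuda graph
  std::cout << IsFastGCEnabled(true, true, false) << "\n";   // 1: fast eager deletion
  std::cout << IsFastGCEnabled(false, true, false) << "\n";  // 0: allocator not stream-safe
  return 0;
}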
@@ -31,6 +31,7 @@
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/device_manager.h"
PADDLE_DEFINE_EXPORTED_bool(
@@ -50,6 +51,10 @@ PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor,
DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);
+DECLARE_bool(new_executor_use_cuda_graph);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+DECLARE_bool(sync_nccl_allreduce);
+#endif
constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";
@@ -142,6 +147,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
    }
    return lhs_prority > rhs_prority;
  };
+  PrepareForCUDAGraphCapture();
}
InterpreterCore::~InterpreterCore() {
@@ -161,6 +168,7 @@ interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
    const std::vector<phi::DenseTensor>& feed_tensors) {
  SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
  Prepare(feed_names, feed_tensors, true);
  interpreter::CostInfo cost_info;
@@ -221,6 +229,7 @@ paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names,
    const std::vector<phi::DenseTensor>& feed_tensors) {
  SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
@@ -240,7 +249,16 @@ paddle::framework::FetchList InterpreterCore::Run(
  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
-    return std::move(*fetch_var->GetMutable<framework::FetchList>());
+    auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
+#ifdef PADDLE_WITH_CUDA
+    if (platform::IsCUDAGraphCapturing()) {
+      PADDLE_ENFORCE_EQ(fetch_list.empty(),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "Cannot fetch data when using CUDA Graph."));
+    }
+#endif
+    return fetch_list;
  } else {
    return {};
  }
@@ -249,6 +267,7 @@ paddle::framework::FetchList InterpreterCore::Run(
paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names, bool need_fetch) {
  SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
@@ -290,7 +309,16 @@ paddle::framework::FetchList InterpreterCore::Run(
      HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
  auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
  if (fetch_var && need_fetch) {
-    return std::move(*fetch_var->GetMutable<framework::FetchList>());
+    auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
+#ifdef PADDLE_WITH_CUDA
+    if (platform::IsCUDAGraphCapturing()) {
+      PADDLE_ENFORCE_EQ(fetch_list.empty(),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "Cannot fetch data when using CUDA Graph."));
+    }
+#endif
+    return fetch_list;
  } else {
    return {};
  }
@@ -504,6 +532,67 @@ void InterpreterCore::BuildInplace() {
  }
}
void InterpreterCore::PrepareForCUDAGraphCapture() {
if (!FLAGS_new_executor_use_cuda_graph) return;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(),
false,
platform::errors::PermissionDenied("CUDA Graph is not allowed to capture "
"when running the first batch."));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(place_),
true,
platform::errors::InvalidArgument(
"CUDA Graph is only supported on NVIDIA GPU device."));
// If set true, will call `cudaStreamSynchronize(nccl_stream)`after allreduce.
// which may cause error in cuda graph. This behavior is consistent with PE.
PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce,
false,
platform::errors::InvalidArgument(
"FLAGS_sync_nccl_allreduce must be False to support "
"CUDA Graph capturing."));
// All output vars of coalesce_tensor op should not be gc.
// If fused output var of coalesce_tensor is gc, it will cause accuracy
// problem. The specific reasons need to be analyzed.
for (auto& op_desc : block_.AllOps()) {
if (op_desc->Type() == kCoalesceTensor) {
for (auto& out_var_name : op_desc->OutputArgumentNames()) {
execution_config_.skip_gc_vars.insert(out_var_name);
VLOG(4) << "Insert Var(" << out_var_name << ") into skip_gc_vars.";
}
}
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDA Graph is only supported on NVIDIA GPU device."));
#endif
}
void InterpreterCore::CheckCUDAGraphBeforeRun(
const std::vector<std::string>& feed_names) {
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
feed_names.empty(),
true,
platform::errors::InvalidArgument(
"Feeding data is not permitted when capturing CUDA Graph."));
PADDLE_ENFORCE_EQ(
FLAGS_new_executor_use_cuda_graph,
true,
platform::errors::InvalidArgument(
"You must turn on FLAGS_new_executor_use_cuda_graph to True "
"to enable CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
place_,
platform::CUDAGraphCapturingPlace(),
platform::errors::InvalidArgument("The place to capture CUDAGraph is "
"not the same as the place to run."));
}
#endif
}
void InterpreterCore::BuildOperatorDependences() {
  // analysis the dependences between ops, add next_instr_list to each instr,
  // and set the dependecy_count_
......
@@ -97,6 +97,10 @@ class InterpreterCore {
      const std::vector<std::vector<size_t>>& input_var2op, size_t var_index);
  void SetFeedVarsInplaceSkip(const std::vector<std::string>& feed_names);
+  // cuda graph
+  void CheckCUDAGraphBeforeRun(const std::vector<std::string>& feed_names);
+  void PrepareForCUDAGraphCapture();
  // execution
  void RunImpl();
  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
......
@@ -1561,6 +1561,63 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
  this->Info().infer_shape_(&infer_shape_ctx);
}
template <typename T>
bool HasSameTensorType(phi::TensorBase* phi_tensor, Variable* var) {
if (phi_tensor == nullptr && var == nullptr) {
return true;
} else if (phi_tensor != nullptr && var != nullptr) {
if (T::classof(phi_tensor) && var->IsType<T>()) {
return true;
}
}
return false;
}
// TODO(YuanRisheng): We need collect all `need_prepare_phi_data_`
// into this function.
void OperatorWithKernel::CheckWhetherPreparePhiData(
const VariableNameMap& innames,
const VariableNameMap& outnames,
const Scope& scope) const {
if (run_phi_kernel_ && impl_ != nullptr) {
const auto& phi_kernel_context = impl_->getKernelContext();
size_t phi_tensor_index = 0;
    // Check each tensor in the KernelContext: if any tensor's type differs
    // from its variable's type, the PhiKernelContext needs to be reconstructed.
    // We use kernel_signature_'s outputs to retrieve the tensors, because the
    // tensors in phi_kernel_context are stored in the order of
    // kernel_signature_'s outputs.
if (phi_kernel_context->OutputsSize() >= phi_tensor_index ||
kernel_signature_ == nullptr) {
need_prepare_phi_data_ = true;
return;
}
const auto& phi_output_names = kernel_signature_->output_names;
for (auto& phi_output_name : phi_output_names) {
const auto& iter = outnames.find(phi_output_name);
if (iter != outnames.end()) {
for (auto& var_name : iter->second) {
auto var_output = scope.FindVar(var_name);
auto phi_output =
phi_kernel_context->MutableOutputAt<phi::TensorBase>(
phi_tensor_index);
if (phi_output == nullptr) {
continue;
}
if (!(HasSameTensorType<phi::DenseTensor>(phi_output, var_output) ||
HasSameTensorType<phi::SparseCooTensor>(phi_output,
var_output) ||
HasSameTensorType<framework::Strings>(phi_output,
var_output))) {
need_prepare_phi_data_ = true;
}
phi_tensor_index++;
}
}
}
}
}
void OperatorWithKernel::RunImpl(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
// To reduce the elapsed time of HasAttr, we use bool variable to record the // To reduce the elapsed time of HasAttr, we use bool variable to record the
...@@ -1571,6 +1628,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -1571,6 +1628,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
HasAttr(kAllKernelsMustComputeRuntimeShape)) HasAttr(kAllKernelsMustComputeRuntimeShape))
all_kernels_must_compute_runtime_shape_ = true; all_kernels_must_compute_runtime_shape_ = true;
const Scope* cur_scope = &scope; const Scope* cur_scope = &scope;
CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
if (!enable_cache_runtime_context_) { if (!enable_cache_runtime_context_) {
RuntimeContext ctx(Inputs(), Outputs(), scope); RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx); RunImpl(scope, place, &ctx);
...@@ -2993,7 +3051,6 @@ void OperatorWithKernel::BuildPhiKernelContext( ...@@ -2993,7 +3051,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
"to the size of kernel attribute_defs (%d).", "to the size of kernel attribute_defs (%d).",
attr_names.size(), attr_names.size(),
attr_defs.size())); attr_defs.size()));
for (size_t i = 0; i < input_names.size(); ++i) { for (size_t i = 0; i < input_names.size(); ++i) {
auto it = ctx.inputs.find(input_names[i]); auto it = ctx.inputs.find(input_names[i]);
...@@ -3037,6 +3094,9 @@ void OperatorWithKernel::BuildPhiKernelContext( ...@@ -3037,6 +3094,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
} else if (var->IsType<framework::Vocab>()) { } else if (var->IsType<framework::Vocab>()) {
tensor_in = &(var->Get<framework::Vocab>()); tensor_in = &(var->Get<framework::Vocab>());
phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} else if (var->IsType<framework::FeedList>()) {
tensor_in = &(var->Get<framework::FeedList>());
phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} else { } else {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported input `%s` type when call pt kernel.", "Unsupported input `%s` type when call pt kernel.",
...@@ -3047,7 +3107,6 @@ void OperatorWithKernel::BuildPhiKernelContext( ...@@ -3047,7 +3107,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
phi_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); phi_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
} }
VLOG(4) << "Done inputs"; VLOG(4) << "Done inputs";
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
auto it = ctx.outputs.find(output_names[i]); auto it = ctx.outputs.find(output_names[i]);
size_t start_idx = size_t start_idx =
...@@ -3087,6 +3146,9 @@ void OperatorWithKernel::BuildPhiKernelContext( ...@@ -3087,6 +3146,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
// Note: If the input LoDTensorArray size is 0, the output // Note: If the input LoDTensorArray size is 0, the output
// LoDTensorArray is also 0 // LoDTensorArray is also 0
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<framework::Strings>()) {
tensor_out = var->template GetMutable<framework::Strings>();
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<paddle::framework::RawTensor>()) { } else if (var->template IsType<paddle::framework::RawTensor>()) {
tensor_out = var->template GetMutable<paddle::framework::RawTensor>(); tensor_out = var->template GetMutable<paddle::framework::RawTensor>();
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
...@@ -3108,7 +3170,6 @@ void OperatorWithKernel::BuildPhiKernelContext( ...@@ -3108,7 +3170,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
i); i);
} }
VLOG(4) << "Done outputs"; VLOG(4) << "Done outputs";
for (size_t i = 0; i < attr_names.size(); ++i) { for (size_t i = 0; i < attr_names.size(); ++i) {
VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": " VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": "
<< attr_defs[i].type_index; << attr_defs[i].type_index;
......
...@@ -550,6 +550,13 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { ...@@ -550,6 +550,13 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
return var->IsType<phi::SparseCooTensor>(); return var->IsType<phi::SparseCooTensor>();
} }
bool IsSparseCooTensorOutput(const std::string& name) const override {
auto vars = ctx_.MultiOutputVar(name);
return std::all_of(vars.begin(), vars.end(), [](const Variable* var) {
return var->IsType<phi::SparseCooTensor>();
});
}
bool IsSparseCsrTensorInput(const std::string& name) const override { bool IsSparseCsrTensorInput(const std::string& name) const override {
const auto* var = ctx_.InputVar(name); const auto* var = ctx_.InputVar(name);
return var->IsType<phi::SparseCsrTensor>(); return var->IsType<phi::SparseCsrTensor>();
...@@ -746,6 +753,10 @@ class OperatorWithKernel : public OperatorBase { ...@@ -746,6 +753,10 @@ class OperatorWithKernel : public OperatorBase {
RuntimeContext* ctx, RuntimeContext* ctx,
const phi::Place& place) const; const phi::Place& place) const;
void CheckWhetherPreparePhiData(const VariableNameMap& innames,
const VariableNameMap& outnames,
const Scope& scope) const;
void TransferInplaceVarsBack(const Scope& scope, void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars, const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const; const Scope& exec_scope) const;
......
...@@ -60,6 +60,7 @@ if(WITH_TESTING) ...@@ -60,6 +60,7 @@ if(WITH_TESTING)
elementwise_add_op elementwise_add_op
generated_op) generated_op)
set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN")
target_link_libraries(build_cinn_pass_test ${PYTHON_LIBRARIES})
cc_test_old(transform_desc_test SRCS transform_desc_test.cc DEPS cc_test_old(transform_desc_test SRCS transform_desc_test.cc DEPS
transform_desc) transform_desc)
......
...@@ -23,8 +23,8 @@ namespace paddle { ...@@ -23,8 +23,8 @@ namespace paddle {
namespace framework { namespace framework {
/// \brief Fluid Kernel and PHI Kernel will be unified in the future. /// \brief Fluid Kernel and PHI Kernel will be unified in the future.
/// So, we need a class in PHI that can represent the RAW type in Fluid. /// So, we need a class in PHI that can represent the RawTensor type in Fluid.
/// The RawTensor is for PHI Kernel that has RAW type arguments. /// The RawTensor is for PHI Kernel that has RawTensor type arguments.
class RawTensor : public phi::ExtendedTensor, class RawTensor : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, RawTensor> { public phi::TypeInfoTraits<phi::TensorBase, RawTensor> {
public: public:
...@@ -37,13 +37,35 @@ class RawTensor : public phi::ExtendedTensor, ...@@ -37,13 +37,35 @@ class RawTensor : public phi::ExtendedTensor,
RawTensor& operator=(RawTensor&& other) = default; RawTensor& operator=(RawTensor&& other) = default;
/// \brief Destroy the RawTensor and release exclusive resources. /// \brief Destroy the RawTensor and release exclusive resources.
virtual ~RawTensor() = default; virtual ~RawTensor() {
if (!data_.empty()) {
data_deleter_();
}
}
public: public:
/// \brief Returns the name of the class for type traits. /// \brief Returns the name of the class for type traits.
/// \return The name of the class. /// \return The name of the class.
static const char* name() { return "RawTensor"; } static const char* name() { return "RawTensor"; }
template <typename T>
T& Get() const {
PADDLE_ENFORCE_EQ(data_.empty(),
false,
platform::errors::PreconditionNotMet(
"The data in RawTensor is empty. Please set data "
"before using it."));
try {
return *(paddle::any_cast<T*>(data_));
} catch (paddle::bad_any_cast&) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Invalid data type error, expected %s, actual %s.",
typeid(T).name(),
data_type_.name()));
}
}
template <typename T> template <typename T>
T* GetMutable() { T* GetMutable() {
if (!data_.empty()) { if (!data_.empty()) {
...@@ -70,7 +92,7 @@ class RawTensor : public phi::ExtendedTensor, ...@@ -70,7 +92,7 @@ class RawTensor : public phi::ExtendedTensor,
private: private:
paddle::any data_; paddle::any data_;
std::function<void(void)> data_deleter_; std::function<void(void)> data_deleter_ = []() {};
std::type_index data_type_ = std::type_index(typeid(void)); std::type_index data_type_ = std::type_index(typeid(void));
}; };
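// A minimal usage sketch of the Get/GetMutable pair above, assuming
// "paddle/fluid/framework/raw_tensor.h" is included and that GetMutable<T>()
// default-constructs the stored value on first use (its body is elided in
// this hunk). Names are illustrative only.
#include <vector>

#include "paddle/fluid/framework/raw_tensor.h"

void RawTensorUsageSketch() {
  paddle::framework::RawTensor raw;
  // First access creates the value and makes RawTensor own it.
  auto* vec = raw.GetMutable<std::vector<int>>();
  vec->push_back(42);
  // Get<T>() returns the stored value, or throws when the requested type does
  // not match what was stored (or when nothing has been stored yet).
  auto& same_vec = raw.Get<std::vector<int>>();
  (void)same_vec;
}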
......
...@@ -25,6 +25,10 @@ limitations under the License. */ ...@@ -25,6 +25,10 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Note(YuanRisheng): Vocab is mainly used for faster_tokenizer_op, and we
// don't recommend using it widely, because faster_tokenizer_op may be deleted
// in the future and this class would be removed along with it.
class Vocab : public phi::ExtendedTensor, class Vocab : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, Vocab> { public phi::TypeInfoTraits<phi::TensorBase, Vocab> {
public: public:
...@@ -94,8 +98,73 @@ class Vocab : public phi::ExtendedTensor, ...@@ -94,8 +98,73 @@ class Vocab : public phi::ExtendedTensor,
std::unordered_map<std::wstring, std::int32_t> data_; std::unordered_map<std::wstring, std::int32_t> data_;
}; };
// Note(YuanRisheng): PhiVector is essentially a vector that is only used for
// PHI kernels. Use it when you define a non-tensor type that needs to be
// stored in a vector as a PHI kernel argument (a usage sketch follows the
// Strings alias below).
template <typename T>
class PhiVector : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, PhiVector<T>> {
public:
PhiVector() = default;
explicit PhiVector(const std::vector<T>& init_data) : data_(init_data) {}
PhiVector(PhiVector&& other) = default;
PhiVector(const PhiVector& other) = default;
PhiVector& operator=(const PhiVector& other) = default;
PhiVector& operator=(const std::vector<T>& other) {
data_ = other;
return *this;
}
PhiVector& operator=(PhiVector&& other) = default;
/// \brief Destroy the PhiVector and release exclusive resources.
virtual ~PhiVector() = default;
public:
/// \brief Returns the name of the class for type traits.
/// \return The name of the class.
  static const char* name() {
    // Keep the backing string in a function-local static so the returned
    // pointer stays valid (c_str() of a temporary would dangle).
    static const std::string kTypeName =
        std::string("PhiVector_") + std::string(typeid(T).name());
    return kTypeName.c_str();
  }
size_t size() const { return data_.size(); }
void resize(size_t size) { data_.resize(size); }
void clear() { data_.clear(); }
void emplace_back(const T& feed_data) { data_.emplace_back(feed_data); }
const T& operator[](size_t index) const { return data_[index]; }
T& operator[](size_t index) { return data_[index]; }
T& at(size_t index) { return data_.at(index); }
const T& at(size_t index) const { return data_.at(index); }
typename std::vector<T>::iterator begin() { return data_.begin(); }
typename std::vector<T>::const_iterator begin() const {
return data_.begin();
}
typename std::vector<T>::iterator end() { return data_.end(); }
typename std::vector<T>::const_iterator end() const { return data_.end(); }
private:
std::vector<T> data_;
};
using String = std::string; using String = std::string;
using Strings = std::vector<std::string>; using Strings = PhiVector<std::string>;
// Convert the std::string type to the std::wstring type. // Convert the std::string type to the std::wstring type.
bool ConvertStrToWstr(const std::string& src, std::wstring* res); bool ConvertStrToWstr(const std::string& src, std::wstring* res);
......
...@@ -221,6 +221,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< ...@@ -221,6 +221,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
Vocab, Vocab,
std::vector<int>, std::vector<int>,
std::vector<float>, std::vector<float>,
std::vector<std::string>,
RawTensor>; RawTensor>;
template <typename T> template <typename T>
struct VarTypeTrait { struct VarTypeTrait {
......
...@@ -1655,7 +1655,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor( ...@@ -1655,7 +1655,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
auto custom_place = place_; auto custom_place = place_;
auto paddleplace = static_cast<PaddlePlace>( auto paddleplace = static_cast<PaddlePlace>(
static_cast<size_t>(PaddlePlace::kCUSTOM) + static_cast<size_t>(PaddlePlace::kCUSTOM) +
phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
res->SetPlace(paddleplace, custom_place.GetDeviceId()); res->SetPlace(paddleplace, custom_place.GetDeviceId());
} else { } else {
auto gpu_place = place_; auto gpu_place = place_;
...@@ -1710,7 +1711,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor( ...@@ -1710,7 +1711,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
auto custom_place = place_; auto custom_place = place_;
auto paddleplace = static_cast<PaddlePlace>( auto paddleplace = static_cast<PaddlePlace>(
static_cast<size_t>(PaddlePlace::kCUSTOM) + static_cast<size_t>(PaddlePlace::kCUSTOM) +
phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
res->SetPlace(paddleplace, custom_place.GetDeviceId()); res->SetPlace(paddleplace, custom_place.GetDeviceId());
} else { } else {
auto gpu_place = place_; auto gpu_place = place_;
......
...@@ -25,13 +25,16 @@ if(WITH_ONNXRUNTIME) ...@@ -25,13 +25,16 @@ if(WITH_ONNXRUNTIME)
cc_library( cc_library(
zero_copy_tensor_dummy zero_copy_tensor_dummy
SRCS zero_copy_tensor_dummy.cc SRCS zero_copy_tensor_dummy.cc
DEPS onnxruntime) DEPS onnxruntime phi_enforce)
else() else()
cc_library( cc_library(
zero_copy_tensor zero_copy_tensor
SRCS zero_copy_tensor.cc SRCS zero_copy_tensor.cc
DEPS scope lod_tensor enforce) DEPS scope lod_tensor enforce)
cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) cc_library(
zero_copy_tensor_dummy
SRCS zero_copy_tensor_dummy.cc
DEPS phi_enforce)
endif() endif()
cc_test( cc_test(
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_tensor.h" #include "paddle/fluid/inference/api/paddle_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
...@@ -76,7 +77,8 @@ void Tensor::ReshapeStrings(const size_t &shape) { ...@@ -76,7 +77,8 @@ void Tensor::ReshapeStrings(const size_t &shape) {
var, var,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_)); "No tensor called [%s] in the runtime scope", name_));
paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>(); paddle::framework::Strings *tensor =
var->GetMutable<paddle::framework::Strings>();
tensor->resize(shape); tensor->resize(shape);
} }
...@@ -261,7 +263,9 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -261,7 +263,9 @@ void Tensor::CopyFromCpu(const T *data) {
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
paddle::platform::CustomPlace custom_place( paddle::platform::CustomPlace custom_place(
phi::GetGlobalDeviceType(device_type_id), device_); phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id),
device_);
auto *t_data = tensor->mutable_data<T>(custom_place); auto *t_data = tensor->mutable_data<T>(custom_place);
auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
pool.Get(custom_place)); pool.Get(custom_place));
...@@ -354,7 +358,7 @@ void Tensor::ShareExternalData(const T *data, ...@@ -354,7 +358,7 @@ void Tensor::ShareExternalData(const T *data,
} }
void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
EAGER_GET_TENSOR(paddle_infer::Strings); EAGER_GET_TENSOR(paddle::framework::Strings);
PADDLE_ENFORCE_GE(tensor->size(), PADDLE_ENFORCE_GE(tensor->size(),
0, 0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
......
...@@ -112,6 +112,12 @@ bool PluginArgumentMappingContext::IsSparseCooTensorInput( ...@@ -112,6 +112,12 @@ bool PluginArgumentMappingContext::IsSparseCooTensorInput(
const std::string& name) const { const std::string& name) const {
return false; return false;
} }
bool PluginArgumentMappingContext::IsSparseCooTensorOutput(
const std::string& name) const {
return false;
}
bool PluginArgumentMappingContext::IsSparseCsrTensorInput( bool PluginArgumentMappingContext::IsSparseCsrTensorInput(
const std::string& name) const { const std::string& name) const {
return false; return false;
......
...@@ -56,6 +56,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext { ...@@ -56,6 +56,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext {
bool IsDenseTensorOutput(const std::string& name) const override; bool IsDenseTensorOutput(const std::string& name) const override;
bool IsSparseCooTensorOutput(const std::string& name) const override;
bool IsSelectedRowsOutput(const std::string& name) const override; bool IsSelectedRowsOutput(const std::string& name) const override;
bool IsForInferShape() const override { return false; } bool IsForInferShape() const override { return false; }
......
...@@ -124,6 +124,7 @@ TEST(ArgMappingContexTest, BasicFunction) { ...@@ -124,6 +124,7 @@ TEST(ArgMappingContexTest, BasicFunction) {
EXPECT_EQ(context.IsDenseTensorOutput("Out"), false); EXPECT_EQ(context.IsDenseTensorOutput("Out"), false);
EXPECT_EQ(context.IsSelectedRowsOutput("Out"), false); EXPECT_EQ(context.IsSelectedRowsOutput("Out"), false);
EXPECT_EQ(context.IsSparseCooTensorOutput("Out"), false);
EXPECT_EQ(context.IsForInferShape(), false); EXPECT_EQ(context.IsForInferShape(), false);
} }
......
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
DECLARE_bool(use_shm_cache);
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
...@@ -111,20 +113,33 @@ void AllocateMemoryMap( ...@@ -111,20 +113,33 @@ void AllocateMemoryMap(
std::shared_ptr<RefcountedMemoryMapAllocation> std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename, AllocateRefcountedMemoryMapAllocation(std::string filename,
int flags, int flags,
size_t size) { size_t size,
int buffer_id) {
int fd = -1; int fd = -1;
void *base_ptr = nullptr; void *base_ptr = nullptr;
AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); if (buffer_id == -1) {
AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd);
VLOG(4) << "Create and mmap a new shm: " << filename;
} else {
base_ptr = MemoryMapAllocationPool::Instance().GetById(buffer_id).mmap_ptr_;
VLOG(4) << "Get a cached shm " << filename;
}
void *aliged_base_ptr = void *aliged_base_ptr =
static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment); static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment);
return std::make_shared<RefcountedMemoryMapAllocation>( return std::make_shared<RefcountedMemoryMapAllocation>(
aliged_base_ptr, size, filename, flags, fd); aliged_base_ptr, size, filename, flags, fd, buffer_id);
} }
RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation(
void *ptr, size_t size, std::string ipc_name, int fd, int flags) void *ptr,
size_t size,
std::string ipc_name,
int fd,
int flags,
int buffer_id)
: MemoryMapAllocation(ptr, size, ipc_name, fd, flags) { : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) {
// must reset base ptr first. // must reset base ptr first.
buffer_id_ = buffer_id;
resetBaseptr(); resetBaseptr();
initializeRefercount(); initializeRefercount();
} }
...@@ -165,25 +180,40 @@ void RefcountedMemoryMapAllocation::initializeRefercount() { ...@@ -165,25 +180,40 @@ void RefcountedMemoryMapAllocation::initializeRefercount() {
} }
void RefcountedMemoryMapAllocation::close() { void RefcountedMemoryMapAllocation::close() {
VLOG(4) << "Close a RefcountedMemoryMapAllocation: " << ipc_name_;
if (closed_) { if (closed_) {
return; return;
} }
closed_ = true; closed_ = true;
void *data = map_ptr_; void *data = map_ptr_;
CountInfo *info = reinterpret_cast<CountInfo *>(data); CountInfo *info = reinterpret_cast<CountInfo *>(data);
if (--info->refcount == 0) { --info->refcount;
shm_unlink(ipc_name_.c_str()); if (FLAGS_use_shm_cache && buffer_id_ != -1) {
VLOG(6) << "shm_unlink file: " << ipc_name_; return;
} else {
if (FLAGS_use_shm_cache &&
MemoryMapAllocationPool::Instance().BufferSize() <
static_cast<size_t>(
MemoryMapAllocationPool::Instance().MaxPoolSize())) {
MemoryMapAllocationPool::Instance().Insert(MemoryMapInfo(
flags_, map_size_ - mmap_alignment, ipc_name_, map_ptr_));
} else {
if (info->refcount == 0 &&
shm_open(ipc_name_.c_str(), O_RDWR, (mode_t)0600) != -1) {
shm_unlink(ipc_name_.c_str());
VLOG(6) << "shm_unlink file: " << ipc_name_;
}
PADDLE_ENFORCE_NE(munmap(map_ptr_, map_size_),
-1,
platform::errors::Unavailable(
"could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
}
} }
PADDLE_ENFORCE_NE(
munmap(map_ptr_, map_size_),
-1,
platform::errors::Unavailable("could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
} }
MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
...@@ -299,6 +329,67 @@ void MemoryMapFdSet::Clear() { ...@@ -299,6 +329,67 @@ void MemoryMapFdSet::Clear() {
MemoryMapFdSet::~MemoryMapFdSet() { Clear(); } MemoryMapFdSet::~MemoryMapFdSet() { Clear(); }
MemoryMapAllocationPool *MemoryMapAllocationPool::pool_ = nullptr;
void MemoryMapAllocationPool::Insert(const MemoryMapInfo &memory_map) {
std::lock_guard<std::mutex> guard(mtx_);
memory_map_allocations_.push_back(memory_map);
VLOG(4) << this << "Intsert a new shm: " << memory_map.file_name_;
}
int MemoryMapAllocationPool::FindFromCache(const int &flag,
const size_t &data_size,
const std::string &file_name,
bool check_refcount) {
std::lock_guard<std::mutex> guard(mtx_);
for (size_t idx = 0; idx < memory_map_allocations_.size(); idx++) {
if (memory_map_allocations_.at(idx).flags_ == flag &&
memory_map_allocations_.at(idx).data_size_ == data_size) {
if (file_name == "" ||
memory_map_allocations_.at(idx).file_name_ == file_name) {
if (!check_refcount || reinterpret_cast<CountInfo *>(
memory_map_allocations_.at(idx).mmap_ptr_)
->refcount == 0) {
VLOG(4) << "Match at: " << idx;
return idx;
}
}
}
}
return -1;
}
const MemoryMapInfo &MemoryMapAllocationPool::GetById(int id) {
std::lock_guard<std::mutex> guard(mtx_);
return memory_map_allocations_.at(id);
}
void MemoryMapAllocationPool::SetMaxPoolSize(const int &size) {
max_pool_size_ = size;
VLOG(4) << this << "Set max pool size is: " << max_pool_size_;
}
void MemoryMapAllocationPool::Clear() {
std::lock_guard<std::mutex> guard(mtx_);
for (auto mmap : memory_map_allocations_) {
int rlt = shm_unlink(mmap.file_name_.c_str());
if (rlt == 0) {
VLOG(4) << "MemoryMapAllocationPool: clear " << mmap.file_name_;
}
PADDLE_ENFORCE_NE(munmap(mmap.mmap_ptr_, mmap.data_size_ + mmap_alignment),
-1,
platform::errors::Unavailable(
"could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
}
memory_map_allocations_.clear();
}
MemoryMapAllocationPool::~MemoryMapAllocationPool() { Clear(); }
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
......
...@@ -75,8 +75,12 @@ class MemoryMapAllocation : public Allocation { ...@@ -75,8 +75,12 @@ class MemoryMapAllocation : public Allocation {
class RefcountedMemoryMapAllocation : public MemoryMapAllocation { class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
public: public:
RefcountedMemoryMapAllocation( RefcountedMemoryMapAllocation(void *ptr,
void *ptr, size_t size, std::string ipc_name, int flags, int fd); size_t size,
std::string ipc_name,
int flags,
int fd,
int buffer_id = -1);
void incref(); void incref();
int decref(); int decref();
...@@ -84,6 +88,7 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation { ...@@ -84,6 +88,7 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
virtual ~RefcountedMemoryMapAllocation() { close(); } virtual ~RefcountedMemoryMapAllocation() { close(); }
protected: protected:
int buffer_id_ = -1;
void initializeRefercount(); void initializeRefercount();
void resetBaseptr(); void resetBaseptr();
}; };
...@@ -94,7 +99,8 @@ void AllocateMemoryMap( ...@@ -94,7 +99,8 @@ void AllocateMemoryMap(
std::shared_ptr<RefcountedMemoryMapAllocation> std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename, AllocateRefcountedMemoryMapAllocation(std::string filename,
int flags, int flags,
size_t size); size_t size,
int buffer_id = -1);
class MemoryMapWriterAllocation : public Allocation { class MemoryMapWriterAllocation : public Allocation {
public: public:
...@@ -153,6 +159,68 @@ class MemoryMapFdSet { ...@@ -153,6 +159,68 @@ class MemoryMapFdSet {
std::mutex mtx_; std::mutex mtx_;
}; };
class MemoryMapInfo {
public:
explicit MemoryMapInfo(int flags,
size_t data_size,
std::string file_name,
void *mmap_ptr)
: flags_(flags),
data_size_(data_size),
file_name_(file_name),
mmap_ptr_(mmap_ptr) {}
int flags_ = 0;
size_t data_size_ = 0;
std::string file_name_;
void *mmap_ptr_ = nullptr;
};
/* Note(zhangbo):
MemoryMapAllocationPool caches and reuses shared-memory (shm) segments, which
reduces the number of munmap calls issued by the dataloader. The
munmap(shm_mmap_ptr) call in RefcountedMemoryMapAllocation::close() may block
other threads of the process. Therefore the shm cache-and-reuse logic works as
follows: shm segments created by the _share_filename process are cached and
reused according to their data_size, which avoids munmap blocking other
threads. (A usage sketch follows the class definition below.)
*/
class MemoryMapAllocationPool {
public:
static MemoryMapAllocationPool &Instance() {
if (pool_ == nullptr) {
pool_ = new MemoryMapAllocationPool();
}
return *pool_;
}
void Insert(const MemoryMapInfo &memory_map);
int FindFromCache(const int &flag,
const size_t &data_size,
const std::string &file_name = "",
bool check_refcount = true);
const MemoryMapInfo &GetById(int id);
size_t BufferSize() { return memory_map_allocations_.size(); }
void Clear();
void SetMaxPoolSize(const int &size);
int MaxPoolSize() { return max_pool_size_; }
~MemoryMapAllocationPool();
private:
MemoryMapAllocationPool() = default;
static MemoryMapAllocationPool *pool_;
std::vector<MemoryMapInfo> memory_map_allocations_;
int max_pool_size_ = 0;
std::mutex mtx_;
};
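// A minimal sketch of the cache-lookup pattern this pool enables inside
// AllocateRefcountedMemoryMapAllocation(): probe the pool for a cached shm
// segment with the same flags and size, and fall back to creating a new
// mapping on a miss. It assumes the header declaring MemoryMapAllocationPool
// is included; the function and variable names below are illustrative.
void ReuseShmSketch(int flags, size_t data_size) {
  using paddle::memory::allocation::MemoryMapAllocationPool;
  auto& pool = MemoryMapAllocationPool::Instance();
  int buffer_id = pool.FindFromCache(flags, data_size);  // -1 means cache miss
  if (buffer_id != -1) {
    // Reuse the cached mapping instead of shm_open/mmap-ing a new one.
    const auto& info = pool.GetById(buffer_id);
    (void)info.mmap_ptr_;
  }
  // On a miss the caller maps a new segment; close() may later Insert() it
  // into the pool when FLAGS_use_shm_cache is on and the pool is not full.
}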
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
......
...@@ -44,6 +44,7 @@ if(WITH_TESTING) ...@@ -44,6 +44,7 @@ if(WITH_TESTING)
cinn_launch_context cinn_launch_context
cinn_instruction_run_op cinn_instruction_run_op
cinn) cinn)
target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES})
set_tests_properties(cinn_launch_context_test PROPERTIES LABELS set_tests_properties(cinn_launch_context_test PROPERTIES LABELS
"RUN_TYPE=CINN") "RUN_TYPE=CINN")
...@@ -73,6 +74,7 @@ if(WITH_TESTING) ...@@ -73,6 +74,7 @@ if(WITH_TESTING)
cinn_launch_op cinn_launch_op
cinn_instruction_run_op cinn_instruction_run_op
elementwise_add_op) elementwise_add_op)
target_link_libraries(cinn_instruction_run_op_test ${PYTHON_LIBRARIES})
set_tests_properties( set_tests_properties(
cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
"${CINN_RUN_ENVIRONMENT}") "${CINN_RUN_ENVIRONMENT}")
......
...@@ -11,6 +11,8 @@ limitations under the License. */ ...@@ -11,6 +11,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/raw_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
namespace paddle { namespace paddle {
...@@ -28,117 +30,128 @@ class OpBase; ...@@ -28,117 +30,128 @@ class OpBase;
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// FeedVariableVisitor is to feed the variable data const framework::FeedType& CheckAndGetFeedItem(const phi::ExtendedTensor& x,
// according to data type (phi::DenseTensor or Strings). int col) {
class FeedVariableVisitor { PADDLE_ENFORCE_GE(col,
public: 0,
explicit FeedVariableVisitor(framework::Variable *out_var, platform::errors::InvalidArgument(
const platform::Place &place) "Expected the column index (the attribute 'col' of "
: out_var_(out_var), place_(place) {} "operator 'Feed') of current feeding variable to be "
"no less than 0. But received column index = %d.",
void operator()(const phi::DenseTensor &in_tensor) const { col));
phi::DenseTensor *out_tensor = out_var_->GetMutable<phi::DenseTensor>(); auto feed_list = static_cast<const paddle::framework::FeedList*>(&x);
if (platform::is_same_place(in_tensor.place(), place_)) { PADDLE_ENFORCE_LT(
out_tensor->ShareDataWith(in_tensor); static_cast<size_t>(col),
#ifdef PADDLE_WITH_IPU feed_list->size(),
} else if (platform::is_ipu_place(place_)) { platform::errors::InvalidArgument(
// For ipu, both in_tensor and out_tensor are allocated on cpu, "The column index of current feeding variable is expected to be "
// PopART will copy tensor from host automatically, "less than the length of feeding list. But received column index = "
// no TensorCopy() is required here. "%d, the length of feeding list = %d",
out_tensor->ShareDataWith(in_tensor); col,
#endif feed_list->size()));
} else {
platform::DeviceContext *context = return feed_list->at(static_cast<size_t>(col));
platform::DeviceContextPool::Instance().Get(place_); }
framework::TensorCopy(in_tensor, place_, *context, out_tensor);
} template <typename Context>
out_tensor->set_lod(in_tensor.lod()); void FeedDenseTensorKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::DenseTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
const auto& in_tensor = paddle::get<phi::DenseTensor>(feed_item);
const auto& place = dev_ctx.GetPlace();
if (platform::is_same_place(in_tensor.place(), place)) {
out->ShareDataWith(in_tensor);
} else {
framework::TensorCopy(in_tensor, place, dev_ctx, out);
} }
void operator()(const framework::Strings &in_str) const { out->set_lod(in_tensor.lod());
framework::Strings *out_str = out_var_->GetMutable<framework::Strings>(); }
out_str->resize(in_str.size());
*out_str = in_str; template <typename Context>
void FeedSparseCooTensorKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::SparseCooTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
const auto& in_tensor = paddle::get<phi::SparseCooTensor>(feed_item);
const auto& place = dev_ctx.GetPlace();
if (platform::is_same_place(in_tensor.place(), place)) {
*out = in_tensor;
} else {
phi::DenseTensor indices, values;
framework::TensorCopy(in_tensor.indices(), place, dev_ctx, &indices);
framework::TensorCopy(in_tensor.values(), place, dev_ctx, &values);
out->SetMember(indices, values, in_tensor.meta());
} }
}
template <typename Context>
void FeedStringsKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::ExtendedTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
auto strs_out = static_cast<framework::Strings*>(out);
const auto& in_str = paddle::get<framework::Strings>(feed_item);
strs_out->resize(in_str.size());
*strs_out = in_str;
}
class FeedOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void operator()(const phi::SparseCooTensor &in_tensor) const { void InferShape(framework::InferShapeContext* ctx) const override {
phi::SparseCooTensor *out_tensor = OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "feed");
out_var_->GetMutable<phi::SparseCooTensor>(); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "feed");
if (platform::is_same_place(in_tensor.place(), place_)) { if (ctx->IsRuntime()) {
*out_tensor = in_tensor; framework::Variable* x_var =
} else { PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]);
platform::DeviceContext *context = auto& x = x_var->Get<framework::FeedList>();
platform::DeviceContextPool::Instance().Get(place_); int col = ctx->Attrs().Get<int>("col");
auto& feed_item = x[col];
phi::DenseTensor indices, values; if (feed_item.index() == 0) {
framework::TensorCopy(in_tensor.indices(), place_, *context, &indices); const auto& feed_item = CheckAndGetFeedItem(x, col);
framework::TensorCopy(in_tensor.values(), place_, *context, &values); auto& feed_tensor = PADDLE_GET_CONST(phi::DenseTensor, feed_item);
out_tensor->SetMember(indices, values, in_tensor.meta()); ctx->SetOutputDim("Out", feed_tensor.dims());
} else if (feed_item.index() == 1) {
auto& feed_str = PADDLE_GET_CONST(framework::Strings, feed_item);
framework::Variable* out_var =
PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]);
out_var->GetMutable<framework::Strings>()->resize(feed_str.size());
} else {
auto& feed_sparse_tensor =
PADDLE_GET_CONST(phi::SparseCooTensor, feed_item);
framework::Variable* out_var =
PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]);
out_var->GetMutable<phi::SparseCooTensor>()->set_meta(
feed_sparse_tensor.meta());
out_var->GetMutable<phi::SparseCooTensor>()->SetCoalesced(
feed_sparse_tensor.coalesced());
out_var->GetMutable<phi::SparseCooTensor>()->SetIndicesDict(
feed_sparse_tensor.GetIndicesDict());
}
} }
} }
private: protected:
framework::Variable *out_var_; phi::KernelKey GetExpectedKernelType(
const platform::Place &place_; const framework::ExecutionContext& ctx) const override {
}; return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace());
class FeedOp : public framework::OperatorBase {
public:
FeedOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
OP_INOUT_CHECK(HasInputs("X"), "Input", "X", "Feed");
OP_INOUT_CHECK(HasOutputs("Out"), "Output", "Out", "Feed");
auto feed_var_name = Input("X");
auto *feed_var = scope.FindVar(feed_var_name);
PADDLE_ENFORCE_NOT_NULL(
feed_var,
platform::errors::NotFound(
"Input varibale(%s) cannot be found in scope for operator 'Feed'.",
feed_var_name));
auto out_name = this->Output("Out");
auto *out_var = scope.FindVar(out_name);
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"Output variable(%s) cannot be found in scope for operator 'Feed'",
out_name));
auto col = Attr<int>("col");
PADDLE_ENFORCE_GE(col,
0,
platform::errors::InvalidArgument(
"Expected the column index (the attribute 'col' of "
"operator 'Feed') of current feeding variable to be "
"no less than 0. But received column index = %d.",
col));
VLOG(3) << "Feed variable " << feed_var_name << "'s " << col
<< " column to variable " << out_name;
auto &feed_list = feed_var->Get<framework::FeedList>();
PADDLE_ENFORCE_LT(
static_cast<size_t>(col),
feed_list.size(),
platform::errors::InvalidArgument(
"The column index of current feeding variable is expected to be "
"less than the length of feeding list. But received column index = "
"%d, the length of feeding list = %d",
col,
feed_list.size()));
auto &feed_item = feed_list.at(static_cast<size_t>(col));
FeedVariableVisitor visitor(out_var, place);
paddle::visit(visitor, feed_item);
} }
}; };
...@@ -164,9 +177,152 @@ It should not be configured by users directly. ...@@ -164,9 +177,152 @@ It should not be configured by users directly.
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
// TODO(YuanRisheng): Maybe we need to design a new registry macro for
// registering device-independent kernels.
REGISTER_OPERATOR( REGISTER_OPERATOR(
feed, feed,
paddle::operators::FeedOp, paddle::operators::FeedOp,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
paddle::operators::FeedOpInfoMaker); paddle::operators::FeedOpInfoMaker);
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
CPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
CPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
CPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CPUContext>,
ALL_DTYPE) {}
#if defined(PADDLE_WITH_MKLDNN)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
GPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::GPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
GPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::GPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
GPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::GPUContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_XPU)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
XPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::XPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
XPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::XPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
XPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::XPUContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_ASCEND_CL)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
npu,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
npu,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
npu,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_MLU)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#endif
...@@ -33,6 +33,27 @@ class CumOp : public framework::OperatorWithKernel { ...@@ -33,6 +33,27 @@ class CumOp : public framework::OperatorWithKernel {
} }
}; };
class CumGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "cumsum");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
"Input",
"Out@GRAD",
"cumsum");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
return phi::KernelKey(input_data_type, ctx.GetPlace());
}
};
class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -69,12 +90,13 @@ class CumsumGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -69,12 +90,13 @@ class CumsumGradMaker : public framework::SingleGradOpMaker<T> {
protected: protected:
void Apply(GradOpPtr<T> grad_op) const override { void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("cumsum"); grad_op->SetType("cumsum_grad");
grad_op->SetInput("X", this->OutputGrad("Out")); grad_op->SetInput("X", this->Input("X"));
grad_op->SetOutput("Out", this->InputGrad("X")); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs()); grad_op->SetAttrMap(this->Attrs());
grad_op->SetAttr("reverse", grad_op->SetAttr("reverse",
!PADDLE_GET_CONST(bool, this->GetAttr("reverse"))); PADDLE_GET_CONST(bool, this->GetAttr("reverse")));
} }
}; };
...@@ -153,6 +175,7 @@ using CPU = phi::CPUContext; ...@@ -153,6 +175,7 @@ using CPU = phi::CPUContext;
DECLARE_INFER_SHAPE_FUNCTOR(cumsum, DECLARE_INFER_SHAPE_FUNCTOR(cumsum,
CumsumInferShapeFunctor, CumsumInferShapeFunctor,
PD_INFER_META(phi::CumScalarAxisInferMeta)); PD_INFER_META(phi::CumScalarAxisInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(logcumsumexp, DECLARE_INFER_SHAPE_FUNCTOR(logcumsumexp,
LogcumsumexpInferShapeFunctor, LogcumsumexpInferShapeFunctor,
PD_INFER_META(phi::CumInferMeta)); PD_INFER_META(phi::CumInferMeta));
...@@ -169,6 +192,7 @@ REGISTER_OPERATOR(logcumsumexp, ...@@ -169,6 +192,7 @@ REGISTER_OPERATOR(logcumsumexp,
ops::LogcumsumexpGradMaker<paddle::imperative::OpBase>, ops::LogcumsumexpGradMaker<paddle::imperative::OpBase>,
LogcumsumexpInferShapeFunctor); LogcumsumexpInferShapeFunctor);
REGISTER_OPERATOR(logcumsumexp_grad, ops::LogcumsumexpGradOp); REGISTER_OPERATOR(logcumsumexp_grad, ops::LogcumsumexpGradOp);
REGISTER_OPERATOR(cumsum_grad, ops::CumGradOp);
REGISTER_OP_VERSION(cumsum).AddCheckpoint( REGISTER_OP_VERSION(cumsum).AddCheckpoint(
R"ROC( R"ROC(
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
class SlogDeterminantOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant");
}
};
class SlogDeterminantOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "(Tensor) The input tensor of SlogDeterminant.");
AddOutput("Out",
"(Tensor) The output tensor containing the sign of the"
"determinant and the natural logarithm"
"of the absolute value of determinant,");
AddComment(R"DOC(
SlogDeterminant Operator.)DOC");
}
};
class SlogDeterminantGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(
ctx->HasInput("Input"), "Input", "Input", "SlogDeterminantGradOp");
OP_INOUT_CHECK(
ctx->HasInput("Out"), "Input", "Out", "SlogDeterminantGradOp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
"Input",
framework::GradVarName("Out"),
"SlogDeterminantGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")),
"Output",
framework::GradVarName("Input"),
"SlogDeterminantGradOp");
ctx->SetOutputDim(framework::GradVarName("Input"),
ctx->GetInputDim("Input"));
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.GetPlace());
}
};
template <typename T>
class SlogDeterminantGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("slogdeterminant_grad");
grad_op->SetInput("Input", this->Input("Input"));
grad_op->SetInput("Out", this->Output("Out"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("Input"),
this->InputGrad("Input"));
grad_op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer,
"Input");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant,
SlogDeterminantInferShapeFunctor,
PD_INFER_META(phi::UnchangedInferMeta));
REGISTER_OPERATOR(slogdeterminant,
ops::SlogDeterminantOp,
ops::SlogDeterminantOpMaker,
ops::SlogDeterminantGradOpMaker<paddle::framework::OpDesc>,
ops::SlogDeterminantGradOpMaker<paddle::imperative::OpBase>,
SlogDeterminantInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant_grad,
SlogDeterminantGradInferShapeFunctor,
PD_INFER_META(phi::GeneralUnaryGradInferMeta));
REGISTER_OPERATOR(slogdeterminant_grad,
ops::SlogDeterminantGradOp,
SlogDeterminantGradInferShapeFunctor) // reuse det grad op
...@@ -19,6 +19,9 @@ limitations under the License. */ ...@@ -19,6 +19,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -63,6 +66,34 @@ class ElementwiseMulOpGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -63,6 +66,34 @@ class ElementwiseMulOpGradMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
class ElementwiseMulGradCompositeOpMaker
: public prim::GradCompositeOpMakerBase {
using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase;
public:
void Apply() override {
auto x = this->GetSingleForwardInput("X");
auto y = this->GetSingleForwardInput("Y");
auto out_grad = this->GetSingleOutputGrad("Out");
auto x_grad = this->GetSingleInputGrad("X");
auto x_grad_p = this->GetOutputPtr(&x_grad);
auto x_grad_name = this->GetOutputName(x_grad);
auto y_grad = this->GetSingleInputGrad("Y");
auto y_grad_p = this->GetOutputPtr(&y_grad);
auto y_grad_name = this->GetOutputName(y_grad);
prim::multiply_grad<prim::DescTensor>(
x,
y,
out_grad,
static_cast<int>(this->Attr<int>("axis")),
x_grad_p,
y_grad_p);
VLOG(3) << "Runing mul_grad composite func";
this->RecoverOutputName(x_grad, x_grad_name);
this->RecoverOutputName(y_grad, y_grad_name);
}
};
template <typename T> template <typename T>
class ElementwiseMulDoubleGradMaker : public framework::SingleGradOpMaker<T> { class ElementwiseMulDoubleGradMaker : public framework::SingleGradOpMaker<T> {
public: public:
...@@ -123,7 +154,8 @@ REGISTER_OPERATOR(elementwise_mul, ...@@ -123,7 +154,8 @@ REGISTER_OPERATOR(elementwise_mul,
ops::ElementwiseMulOpMaker, ops::ElementwiseMulOpMaker,
ops::ElementwiseOpInferVarType, ops::ElementwiseOpInferVarType,
ops::ElementwiseMulOpGradMaker<paddle::framework::OpDesc>, ops::ElementwiseMulOpGradMaker<paddle::framework::OpDesc>,
ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>); ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>,
ops::ElementwiseMulGradCompositeOpMaker);
REGISTER_OPERATOR( REGISTER_OPERATOR(
elementwise_mul_grad, elementwise_mul_grad,
ops::ElementwiseOpGrad, ops::ElementwiseOpGrad,
......
...@@ -20,6 +20,9 @@ limitations under the License. */ ...@@ -20,6 +20,9 @@ limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/unary.h"
...@@ -190,6 +193,24 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -190,6 +193,24 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
class ExpandV2GradCompositeOpMaker : public prim::GradCompositeOpMakerBase {
using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase;
public:
void Apply() override {
auto x = this->GetSingleForwardInput("X");
auto out_grad = this->GetSingleOutputGrad("Out");
auto x_grad = this->GetSingleInputGrad("X");
auto x_grad_p = this->GetOutputPtr(&x_grad);
auto x_grad_name = this->GetOutputName(x_grad);
auto shape = this->Attr<std::vector<int>>("shape");
prim::expand_grad<prim::DescTensor>(
x, out_grad, paddle::experimental::IntArray(shape), x_grad_p);
VLOG(3) << "Runing expand_v2 composite func";
this->RecoverOutputName(x_grad, x_grad_name);
}
};
template <typename T> template <typename T>
class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> { class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public: public:
...@@ -223,6 +244,7 @@ namespace ops = paddle::operators; ...@@ -223,6 +244,7 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(expand_v2, REGISTER_OPERATOR(expand_v2,
ops::ExpandV2Op, ops::ExpandV2Op,
ops::ExpandV2OpMaker, ops::ExpandV2OpMaker,
ops::ExpandV2GradCompositeOpMaker,
ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>, ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>, ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>,
ExpandInferShapeFunctor); ExpandInferShapeFunctor);
......
...@@ -256,6 +256,16 @@ def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict): ...@@ -256,6 +256,16 @@ def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict):
op_item['no_need_buffer'] = get_param_list_alias( op_item['no_need_buffer'] = get_param_list_alias(
op_item['no_need_buffer'], args_map op_item['no_need_buffer'], args_map
) )
if 'data_transform' in op_item and op_item['data_transform']:
data_trans_item = op_item['data_transform']
if 'skip_transform' in data_trans_item:
data_trans_item['skip_transform'] = get_param_list_alias(
data_trans_item['skip_transform'], args_map
)
if 'support_trans_dtype' in data_trans_item:
data_trans_item['support_trans_dtype'] = get_param_list_alias(
data_trans_item['support_trans_dtype'], args_map
)
process_scalar(op_item, scalar_configs) process_scalar(op_item, scalar_configs)
process_int_array(op_item, int_array_configs) process_int_array(op_item, int_array_configs)
......
...@@ -427,7 +427,41 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"): ...@@ -427,7 +427,41 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"):
else: else:
no_buffer_args = None no_buffer_args = None
# TODO(chenfeiyu): data_transform # add data_transform tag for every input.
# the format is {data_transform : {skip_transform : [x, z], support_trans_dtype : y}}
for input in inputs:
input["data_transform"] = {}
if "data_transform" in op_entry:
skip_trans_args = []
support_trans_args = []
data_trans = op_entry["data_transform"]
if "skip_transform" in data_trans:
skip_trans_args = parse_plain_list(data_trans["skip_transform"])
for name in skip_trans_args:
assert (
name in input_names
), f"{op_name} has an skip_transform input: '{name}' which is not an input."
data_trans["skip_transform"] = skip_trans_args
if "support_trans_dtype" in data_trans:
support_trans_args = parse_plain_list(
data_trans["support_trans_dtype"]
)
for name in support_trans_args:
assert (
name in input_names
), f"{op_name} has an support_trans_dtype input: '{name}' which is not an input."
data_trans["support_trans_dtype"] = support_trans_args
for input in inputs:
if input["name"] in skip_trans_args:
input["data_transform"]["skip_trans_args"] = True
else:
input["data_transform"]["skip_trans_args"] = False
if input["name"] in support_trans_args:
input["data_transform"]["support_trans_dtype"] = True
else:
input["data_transform"]["support_trans_dtype"] = False
else:
data_trans = None
op = { op = {
"name": op_name, "name": op_name,
...@@ -435,6 +469,7 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"): ...@@ -435,6 +469,7 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"):
"attrs": attrs, "attrs": attrs,
"outputs": outputs, "outputs": outputs,
"no_need_buffer": no_buffer_args, "no_need_buffer": no_buffer_args,
"data_transform": data_trans,
} }
# invokes another op ? # invokes another op ?
......
...@@ -117,6 +117,15 @@ static_cast<int>(phi::Place({{"phi::" if not default_value is initializer_list}} ...@@ -117,6 +117,15 @@ static_cast<int>(phi::Place({{"phi::" if not default_value is initializer_list}}
{# --------------------------------------- name mapping ---------------------------------------------- #} {# --------------------------------------- name mapping ---------------------------------------------- #}
{% macro name_map(op) %} {% macro name_map(op) %}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) {
{% set kernel_args = op["kernel"]["param"] %} {% set kernel_args = op["kernel"]["param"] %}
{{get_input_list(op["inputs"], kernel_args)}}; {{get_input_list(op["inputs"], kernel_args)}};
...@@ -136,15 +145,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum ...@@ -136,15 +145,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum
return sig; return sig;
{%endif%} {%endif%}
} }
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
{% endmacro %} {% endmacro %}
{% macro get_kernel_dispatch(inputs, kernel_config) %}{# inline #} {% macro get_kernel_dispatch(inputs, kernel_config) %}{# inline #}
...@@ -172,6 +172,15 @@ ctx.IsSparseCsrTensorInput("{{input["name"]}}"){{" && " if not loop.last}} ...@@ -172,6 +172,15 @@ ctx.IsSparseCsrTensorInput("{{input["name"]}}"){{" && " if not loop.last}}
{%- endmacro %} {%- endmacro %}
{% macro sparse_op_name_map(op) %} {% macro sparse_op_name_map(op) %}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) {
{% set kernel_args = op["kernel"]["param"] %} {% set kernel_args = op["kernel"]["param"] %}
{{get_input_list(op["inputs"], kernel_args)}}; {{get_input_list(op["inputs"], kernel_args)}};
...@@ -188,15 +197,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum ...@@ -188,15 +197,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum
KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs));
return sig; return sig;
} }
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
{% endmacro %} {% endmacro %}
{% macro register_base_kernel_name(op) %} {% macro register_base_kernel_name(op) %}
...@@ -284,6 +284,32 @@ phi::KernelKey GetExpectedKernelType( ...@@ -284,6 +284,32 @@ phi::KernelKey GetExpectedKernelType(
} }
{% endmacro %} {% endmacro %}
{% macro get_kernel_for_var(op) %} {# only for data_transform #}
{% set skip_args = op["data_transform"]["skip_transform"] %}
{% set var_name = "var_name" %}
{% set skip_args_len = skip_args | length %}
phi::KernelKey GetKernelTypeForVar(
const std::string& {{var_name}},
const phi::DenseTensor& tensor,
const phi::KernelKey& expected_kernel_type) const override {
if (
{%- for skip_arg in skip_args -%}
var_name == "{{ skip_arg }}"
{%- if skip_args_len != 1 and loop.index != skip_args_len %} || {% endif -%}
{%- endfor -%}
){
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
}
else{
return phi::KernelKey(
tensor.place(), tensor.layout(), expected_kernel_type.dtype());
}
}
{% endmacro %}
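{# Illustrative sketch (hypothetical op whose data_transform declares
   skip_transform : [index]): the macro above renders roughly
     phi::KernelKey GetKernelTypeForVar(const std::string& var_name,
                                        const phi::DenseTensor& tensor,
                                        const phi::KernelKey& expected_kernel_type) const override {
       if (var_name == "index") {
         return phi::KernelKey(phi::Backend::ALL_BACKEND,
                               expected_kernel_type.layout(),
                               expected_kernel_type.dtype());
       } else {
         return phi::KernelKey(
             tensor.place(), tensor.layout(), expected_kernel_type.dtype());
       }
     }
   i.e. skipped inputs are not transformed and stay on their current backend. #}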
{# --------------------------------------- operator ---------------------------------------------- #} {# --------------------------------------- operator ---------------------------------------------- #}
{% macro operator(op) %} {% macro operator(op) %}
class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKernel { class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKernel {
...@@ -293,9 +319,17 @@ class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKerne ...@@ -293,9 +319,17 @@ class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKerne
{% set kernel = op["kernel"] %} {% set kernel = op["kernel"] %}
{% if kernel["data_type"] is not none %} {% if kernel["data_type"] is not none %}
protected: protected:
{% filter indent(2, True)%} {% filter indent(2, True)%}
{{get_expected_kernel(op)}} {{get_expected_kernel(op)}}
{% endfilter %} {% endfilter %}
{%- if "data_transform" in op and op["data_transform"] is not none -%}
{%- if "skip_transform" in op["data_transform"] -%}
{% filter indent(2, True) %}
{{get_kernel_for_var(op)}}
{% endfilter %}
{%- endif %}
{%- endif -%}
{# TODO(lizhiyu): add the 'support_trans_dtype' #}
{% endif %} {% endif %}
}; };
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle {
namespace operators {
class GraphSendRecvOP : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context().GetPlace());
}
};
class GraphSendRecvGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context().GetPlace());
}
};
class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor with data type float32, float64, int32, int64.");
AddInput("Src_index", "The source index tensor.");
AddInput("Dst_index", "The destination index tensor.");
AddInput("Out_size",
"(Tensor<int>, optional). The 0th dimension of the output."
"It has a higher priority than Attr(out_size).")
.AsDispensable();
AddOutput("Out", "Output tensor of graph_send_recv op.");
AddOutput("Dst_count",
"Count tensor of Dst_index, mainly for MEAN reduce_op.")
.AsIntermediate();
AddAttr<std::string>("reduce_op",
"(string, default 'SUM')"
"Define different pool types to receive the result "
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<std::vector<int64_t>>(
"out_size",
"(vector<int64_t>, default {0})"
"Define the first dimension of Output tensor."
"If set default {0}, then the shape of Out is the same with X.")
.SetDefault({0});
AddComment(R"DOC(
Graph Learning Send_Recv combine operator.
$Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$
This operator is mainly used in the Graph Learning domain, and its main purpose is to reduce
intermediate memory consumption in the message-passing process.
Taking `x` as the input tensor, we first use `src_index` to gather the corresponding data,
and then use `dst_index` to update the corresponding positions of the output tensor with
different pooling types, like sum, mean, max, or min.
)DOC");
}
};
template <typename T>
class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("graph_send_recv_grad");
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));
op->SetInput("X", this->Input("X"));
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
}
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" ||
PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") {
op->SetInput("Out", this->Output("Out"));
}
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv,
GraphSendRecvInferShapeFunctor,
PD_INFER_META(phi::SendURecvInferMeta));
REGISTER_OPERATOR(graph_send_recv,
ops::GraphSendRecvOP,
ops::GraphSendRecvOpMaker,
ops::GraphSendRecvGradOpMaker<paddle::framework::OpDesc>,
ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>,
GraphSendRecvInferShapeFunctor);
REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
class GraphSendUERecvOP : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context().GetPlace());
}
};
class GraphSendUERecvGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
auto y_dims = ctx->GetInputDim("Y");
ctx->SetOutputDim(framework::GradVarName("Y"), y_dims);
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context().GetPlace());
}
};
class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor with data type float32, float64, int32, int64.");
AddInput("Y",
"The input edge weight tensor, data type should be same with X");
AddInput("Src_index", "The source index tensor.");
AddInput("Dst_index", "The destination index tensor.");
AddInput("Out_size",
"(Tensor<int>, optional). The 0th dimension of the output."
"It has a higher priority than Attr(out_size).")
.AsDispensable();
AddOutput("Out", "Output tensor of graph_send_ue_recv op.");
AddOutput("Dst_count",
"Count tensor of Dst_index, mainly for MEAN reduce_op.")
.AsIntermediate();
AddAttr<std::string>("message_op",
"(string, default 'ADD')"
"Define differenct computation types between X and E.")
.SetDefault("ADD")
.InEnum({"ADD", "MUL"});
AddAttr<std::string>("reduce_op",
"(string, default 'SUM')"
"Define different pool types to receive the result "
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<std::vector<int64_t>>(
"out_size",
"(vector<int64_t>, default {0})"
"Define the first dimension of Output tensor."
"If set default {0}, then the shape of Out is the same with X.")
.SetDefault({0});
AddComment(R"DOC(
Graph Learning Send_UE_Recv combine operator.
$Out = Recv(Compute(Send(X, Src_index), Y, message_op), Dst_index, reduce_op)$
This operator is mainly used in the Graph Learning domain, and its main purpose is to reduce
intermediate memory consumption in the message-passing process.
Taking `X` as the input tensor, we first use `src_index` to gather the corresponding data,
then combine the gathered data with `Y` using the chosen message_op (add or mul),
and finally use `dst_index` to update the corresponding positions of the output tensor with
different pooling types, like sum, mean, max, or min.
)DOC");
}
};
template <typename T>
class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("graph_send_ue_recv_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Y", this->Input("Y"));
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
}
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" ||
PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") {
op->SetInput("Out", this->Output("Out"));
}
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(graph_send_ue_recv,
GraphSendUERecvInferShapeFunctor,
PD_INFER_META(phi::SendUERecvInferMeta));
REGISTER_OPERATOR(graph_send_ue_recv,
ops::GraphSendUERecvOP,
ops::GraphSendUERecvOpMaker,
ops::GraphSendUERecvGradOpMaker<paddle::framework::OpDesc>,
ops::GraphSendUERecvGradOpMaker<paddle::imperative::OpBase>,
GraphSendUERecvInferShapeFunctor);
REGISTER_OPERATOR(graph_send_ue_recv_grad, ops::GraphSendUERecvGradOp);
...@@ -84,7 +84,7 @@ class ReduceSumCompositeGradOpMaker : public prim::GradCompositeOpMakerBase { ...@@ -84,7 +84,7 @@ class ReduceSumCompositeGradOpMaker : public prim::GradCompositeOpMakerBase {
// get output original name // get output original name
std::string x_grad_name = this->GetOutputName(x_grad_t); std::string x_grad_name = this->GetOutputName(x_grad_t);
VLOG(3) << "Runing sum_grad composite func";
// call composite backward func // call composite backward func
prim::sum_grad<prim::DescTensor>( prim::sum_grad<prim::DescTensor>(
x, out_grad, axis, keep_dim, reduce_all, x_grad); x, out_grad, axis, keep_dim, reduce_all, x_grad);
......
...@@ -95,7 +95,8 @@ class SelectOutputInferShape : public framework::InferShapeBase { ...@@ -95,7 +95,8 @@ class SelectOutputInferShape : public framework::InferShapeBase {
void operator()(framework::InferShapeContext *context) const override { void operator()(framework::InferShapeContext *context) const override {
OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SelectOutput"); OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SelectOutput");
OP_INOUT_CHECK(context->HasInput("Mask"), "Input", "Mask", "SelectOutput"); OP_INOUT_CHECK(context->HasInput("Mask"), "Input", "Mask", "SelectOutput");
OP_INOUT_CHECK(context->HasOutputs("Out"), "Output", "Out", "SelectOutput"); OP_INOUT_CHECK(
context->HasOutputs("Out", true), "Output", "Out", "SelectOutput");
} }
}; };
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
class SizeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto dtype = framework::proto::VarType::FP32; // dtype is not important
return phi::KernelKey(dtype, ctx.GetPlace());
}
phi::KernelKey GetKernelTypeForVar(
const std::string& var_name,
const phi::DenseTensor& tensor,
const phi::KernelKey& expected_kernel_type) const override {
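    // Size only needs the element count (tensor metadata), so skip any data
    // transform and keep the variable on its current backend and layout.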
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
}
};
class SizeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "The input tensor.");
AddOutput("Out",
"The returned tensor, the data type "
"is int64_t, will be on the same device with the input Tensor.");
AddComment(R"DOC(
Size Operator.
Return the number of elements in the input.
)DOC");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(SizeOpNoNeedBufferVarInferer, "Input");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(size,
SizeInferShapeFunctor,
PD_INFER_META(phi::NumelInferMeta));
REGISTER_OPERATOR(
size,
ops::SizeOp,
ops::SizeOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
SizeInferShapeFunctor,
ops::SizeOpNoNeedBufferVarInferer);
...@@ -407,8 +407,8 @@ int BertTokenizer::Encode( ...@@ -407,8 +407,8 @@ int BertTokenizer::Encode(
void BertTokenizer::BatchEncode( void BertTokenizer::BatchEncode(
vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs, vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
const vector<string>& batch_text, const framework::Strings& batch_text,
const vector<string>& batch_text_pair /* = vector<string>() */, const framework::Strings& batch_text_pair /* = vector<string>() */,
bool is_split_into_words /* = false */, bool is_split_into_words /* = false */,
const size_t max_seq_len /* = 0 */, const size_t max_seq_len /* = 0 */,
bool pad_to_max_seq_len /* = false */) const { bool pad_to_max_seq_len /* = false */) const {
......
...@@ -100,8 +100,8 @@ class BertTokenizer { ...@@ -100,8 +100,8 @@ class BertTokenizer {
bool pad_to_max_seq_len = false) const; bool pad_to_max_seq_len = false) const;
void BatchEncode( void BatchEncode(
vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs, vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
const vector<string>& batch_text, const framework::Strings& batch_text,
const vector<string>& batch_text_pair = vector<string>(), const framework::Strings& batch_text_pair = framework::Strings(),
bool is_split_into_words = false, bool is_split_into_words = false,
const size_t max_seq_len = 0, const size_t max_seq_len = 0,
bool pad_to_max_seq_len = false) const; bool pad_to_max_seq_len = false) const;
...@@ -162,7 +162,7 @@ class FasterTokenizerKernel : public framework::OpKernel<T> { ...@@ -162,7 +162,7 @@ class FasterTokenizerKernel : public framework::OpKernel<T> {
} else { } else {
tokenizer.BatchEncode(&batch_encode_inputs, tokenizer.BatchEncode(&batch_encode_inputs,
*text, *text,
vector<string>(), framework::Strings(),
is_split_into_words, is_split_into_words,
max_seq_len, max_seq_len,
pad_to_max_seq_len); pad_to_max_seq_len);
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
DECLARE_bool(use_stream_safe_cuda_allocator); DECLARE_bool(use_stream_safe_cuda_allocator);
DECLARE_bool(new_executor_use_cuda_graph);
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -43,7 +44,10 @@ void BeginCUDAGraphCapture(phi::GPUPlace place, ...@@ -43,7 +44,10 @@ void BeginCUDAGraphCapture(phi::GPUPlace place,
auto stream = dev_ctx->stream(); auto stream = dev_ctx->stream();
CUDAGraph::BeginCapture(place, stream, mode); CUDAGraph::BeginCapture(place, stream, mode);
auto old_value = FLAGS_use_stream_safe_cuda_allocator; // When using cuda graph in new executor, fast GC must be used.
// so FLAGS_use_stream_safe_cuda_allocator must remain true in that case.
auto old_value = FLAGS_use_stream_safe_cuda_allocator &&
!FLAGS_new_executor_use_cuda_graph;
if (old_value) { if (old_value) {
FLAGS_use_stream_safe_cuda_allocator = false; FLAGS_use_stream_safe_cuda_allocator = false;
} }
......
...@@ -20,6 +20,7 @@ namespace platform { ...@@ -20,6 +20,7 @@ namespace platform {
void CudaProfilerInit(const std::string& output_file, void CudaProfilerInit(const std::string& output_file,
const std::string& output_mode, const std::string& output_mode,
const std::string& config_file) { const std::string& config_file) {
#if CUDA_VERSION < 11000
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Unsupported cuda profiler output mode, expect `kvp` or " "Unsupported cuda profiler output mode, expect `kvp` or "
...@@ -28,6 +29,7 @@ void CudaProfilerInit(const std::string& output_file, ...@@ -28,6 +29,7 @@ void CudaProfilerInit(const std::string& output_file,
cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
#endif
} }
void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); }
......
...@@ -183,6 +183,7 @@ class XPUDeviceContext : public phi::XPUContext { ...@@ -183,6 +183,7 @@ class XPUDeviceContext : public phi::XPUContext {
virtual ~XPUDeviceContext(); virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; } Eigen::DefaultDevice* eigen_device() const { return nullptr; }
xpuStream stream() const { return XPUContext::x_context()->xpu_stream; } xpuStream stream() const { return XPUContext::x_context()->xpu_stream; }
void CreateStream() { XPUContext::CreateStream(); }
}; };
template <> template <>
......
generated/prim_api/eager_prim_api.cc
generated/prim_api/tmp_eager_prim_api.cc
generated/prim_api/*.h
add_subdirectory(auto_code_generated)
add_subdirectory(manual) add_subdirectory(manual)
add_subdirectory(generated)
if(NOT (NOT WITH_PYTHON AND ON_INFER)) if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library( cc_library(
prim_api prim_api
......
...@@ -13,6 +13,6 @@ ...@@ -13,6 +13,6 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" #include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/utils/utils.h" #include "paddle/fluid/prim/api/manual/utils/utils.h"
set(api_yaml_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/ops.parsed.yaml"
)
set(legacy_api_yaml_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/legacy_ops.parsed.yaml"
)
set(tmp_eager_prim_api_cc_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_eager_prim_api.cc"
)
set(tmp_prim_api_h_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_prim_api.h"
)
set(eager_prim_api_cc_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc"
)
set(prim_api_h_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/prim_api.h")
set(prim_api_gen_file
${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated/prim_gen.py)
message("prim api Code gen")
execute_process(
WORKING_DIRECTORY
${CMAKE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated
COMMAND
${PYTHON_EXECUTABLE} ${prim_api_gen_file} --api_yaml_path
${legacy_api_yaml_path} ${api_yaml_path} --prim_api_header_path
${tmp_prim_api_h_path} --eager_prim_api_source_path
${tmp_eager_prim_api_cc_path}
RESULT_VARIABLE _result)
if(${_result})
message(FATAL_ERROR "prim api genrate failed, exiting.")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
${tmp_prim_api_h_path} ${prim_api_h_path})
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
${tmp_eager_prim_api_cc_path} ${eager_prim_api_cc_path})
message("copy tmp_xxx_prim_api to xxx_prim_api")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# prim api list
white_ops_list = [
"pow",
"scale",
"multiply",
"unsqueeze",
"expand",
"full",
"reshape",
"divide",
"sum",
"exp",
]
inplace_out_type_map = {
"Tensor": "Tensor&",
"std::vector<Tensor>": "std::vector<Tensor>&",
}
inplace_optional_out_type_map = {
"Tensor": "paddle::optional<Tensor>&",
"std::vector<Tensor>": "paddle::optional<std::vector<Tensor>>&",
}
class BaseAPI:
def __init__(self, api_item_yaml):
# self.api = api_item_yaml['op']
self.api = api_item_yaml['name']
self.is_prim_api = False
if api_item_yaml['name'] in white_ops_list:
self.is_prim_api = True
#######################################
# inputs:
# names : [], list of input names
# input_info : {input_name : type}
# attrs:
# names : [], list of attribute names
# attr_info : { attr_name : (type, default_values)}
# outputs:
# names : [], list of output names
# types : [], list of output types
# out_size_expr : [], expression for getting size of vector<Tensor>
########################################
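# Illustrative sketch (roughly, for an entry such as "scale" with input x and
# attrs scale/bias/bias_after_scale), parse_args would yield something like:
#   inputs  = {'names': ['x'], 'input_info': {'x': 'const Tensor&'}}
#   attrs   = {'names': ['scale', 'bias', 'bias_after_scale'],
#              'attr_info': {'scale': ('const Scalar&', '1.0'), ...}}
#   outputs = {'names': ['out'], 'types': ['Tensor'], 'out_size_expr': [None]}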
if self.is_prim_api:
(
self.inputs,
self.attrs,
self.outputs,
self.optional_vars,
) = self.parse_args(self.api, api_item_yaml)
self.inplace_map = api_item_yaml['inplace']
def get_api_func_name(self):
return self.api
# def is_inplace(self):
# if self.inplace_map
# return True
# return False
def get_input_tensor_args(self, inplace_flag=False):
input_args = []
inplace_type_map = {
"const Tensor&": "Tensor&",
"const paddle::optional<Tensor>&": "paddle::optional<Tensor>&",
"const std::vector<Tensor>&": "std::vector<Tensor>&",
"const paddle::optional<std::vector<Tensor>>&": "paddle::optional<std::vector<Tensor>>&",
}
for name in self.inputs['names']:
name = name.split('@')[0]
if inplace_flag and name in self.inplace_map.values():
input_args.append(
inplace_type_map[self.inputs['input_info'][name]]
+ ' '
+ name
)
else:
input_args.append(self.inputs['input_info'][name] + ' ' + name)
return input_args
def get_declare_args(self, inplace_flag=False):
declare_args = self.get_input_tensor_args(inplace_flag)
for name in self.attrs['names']:
default_value = ''
if self.attrs['attr_info'][name][1] is not None:
default_value = ' = ' + self.attrs['attr_info'][name][1]
declare_args.append(
self.attrs['attr_info'][name][0] + ' ' + name + default_value
)
return ", ".join(declare_args)
def get_declare_args_nodefault(self, inplace_flag=False):
declare_args = self.get_input_tensor_args(inplace_flag)
for name in self.attrs['names']:
declare_args.append(self.attrs['attr_info'][name][0] + ' ' + name)
return ", ".join(declare_args)
def get_return_type(self, inplace_flag=False):
out_type_list = []
for i, out_type in enumerate(self.outputs['types']):
out_name = self.outputs['names'][i].split('@')[0]
if inplace_flag and out_name in self.inplace_map:
if self.inplace_map[out_name] in self.optional_vars:
out_type_list.append(
inplace_optional_out_type_map[out_type]
)
else:
out_type_list.append(inplace_out_type_map[out_type])
else:
out_type_list.append(out_type)
if len(out_type_list) == 1:
return out_type_list[0]
else:
return "std::tuple<" + ", ".join(out_type_list) + ">"
def parse_args(self, api_name, api_item_yaml):
optional_vars = []
for input_dict in api_item_yaml['inputs']:
if input_dict['optional']:
optional_vars.append(input_dict['name'])
inputs, attrs = self.parse_input_and_attr(
api_item_yaml['inputs'], api_item_yaml['attrs']
)
output_type_list, output_names, out_size_expr = self.parse_output(
api_item_yaml['outputs']
)
return (
inputs,
attrs,
{
'names': output_names,
'types': output_type_list,
'out_size_expr': out_size_expr,
},
optional_vars,
)
def parse_input_and_attr(self, inputs_list, attrs_list):
input_types_map = {
'Tensor': 'const Tensor&',
'Tensor[]': 'const std::vector<Tensor>&',
}
attr_types_map = {
'IntArray': 'const IntArray&',
'Scalar': 'const Scalar&',
'Scalar(int)': 'const Scalar&',
'Scalar(int64_t)': 'const Scalar&',
'Scalar(float)': 'const Scalar&',
'Scalar(double)': 'const Scalar&',
'Scalar[]': 'const std::vector<phi::Scalar>&',
'int': 'int',
'int32_t': 'int32_t',
'int64_t': 'int64_t',
'long': 'long',
'size_t': 'size_t',
'float': 'float',
'float[]': 'const std::vector<float>&',
'double': 'double',
'bool': 'bool',
'bool[]': 'const std::vector<bool>&',
'str': 'const std::string&',
'str[]': 'const std::vector<std::string>&',
'Place': 'const Place&',
'DataLayout': 'DataLayout',
'DataType': 'DataType',
'int64_t[]': 'const std::vector<int64_t>&',
'int[]': 'const std::vector<int>&',
}
optional_types_trans = {
'Tensor': 'const paddle::optional<Tensor>&',
'Tensor[]': 'const paddle::optional<std::vector<Tensor>>&',
'int': 'paddle::optional<int>',
'int32_t': 'paddle::optional<int32_t>',
'int64_t': 'paddle::optional<int64_t>',
'float': 'paddle::optional<float>',
'double': 'paddle::optional<double>',
'bool': 'paddle::optional<bool>',
'Place': 'paddle::optional<const Place&>',
'DataLayout': 'paddle::optional<DataLayout>',
'DataType': 'paddle::optional<DataType>',
}
inputs = {'names': [], 'input_info': {}}
for input_dict in inputs_list:
inputs['names'].append(input_dict['name'])
if input_dict['optional']:
inputs['input_info'][input_dict['name']] = optional_types_trans[
input_dict['typename']
]
else:
inputs['input_info'][input_dict['name']] = input_types_map[
input_dict['typename']
]
attrs = {'names': [], 'attr_info': {}}
for attr_dict in attrs_list:
attrs['names'].append(attr_dict['name'])
if 'default_value' in attr_dict.keys():
default_value = attr_dict['default_value']
else:
default_value = None
if 'optional' in attr_dict.keys():
attrs['attr_info'][attr_dict['name']] = (
optional_types_trans[attr_dict['typename']],
default_value,
)
else:
attrs['attr_info'][attr_dict['name']] = (
attr_types_map[attr_dict['typename']],
default_value,
)
return inputs, attrs
def parse_output(self, outputs_list):
out_type_list = []
out_name_list = []
out_size_expr_list = []
for output_dict in outputs_list:
if output_dict['intermediate']:
continue
out_type_list.append(output_dict['typename'])
out_name_list.append(output_dict['name'])
if 'size' in output_dict.keys():
out_size_expr_list.append(output_dict['size'])
else:
out_size_expr_list.append(None)
return out_type_list, out_name_list, out_size_expr_list
class EagerPrimAPI(BaseAPI):
def __init__(self, api_item_yaml):
super().__init__(api_item_yaml)
def get_api__func_name(self):
api_func_name = self.api
# if self.is_inplace:
# if api_func_name[-1] != '_':
# api_func_name += '_'
# print("after api name", api_func_name)
return api_func_name
def gene_prim_api_declaration(self):
api_declaration = ""
api_func_name = self.get_api__func_name()
if api_func_name[-1] != '_':
api_declaration = f"""
template <typename T>
{self.get_return_type()} {api_func_name}({self.get_declare_args()});
"""
else:
api_declaration = (
api_declaration
+ f"""
template <typename T>
{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)});
"""
)
return api_declaration
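# Illustrative sketch: for the "multiply" entry this emits roughly
#   template <typename T>
#   Tensor multiply(const Tensor& x, const Tensor& y);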
def get_ad_func_input_args(self, inplace_flag=False):
# Both branches appended the bare name, so inplace inputs need no special
# handling when forwarding to the ad_func call.
input_args = []
for name in self.inputs['names']:
input_args.append(name.split('@')[0])
return input_args
def get_ad_func_args(self, inplace_flag=False):
# Attribute default values are not needed at the call site; only the names are
# forwarded to the ad_func call.
ad_func_args = self.get_ad_func_input_args(inplace_flag)
for name in self.attrs['names']:
ad_func_args.append(name)
return ", ".join(ad_func_args)
def gene_ad_func_call(self):
api_func_name = self.get_api__func_name()
dygraph_ad_func_name = '::' + api_func_name + '_ad_func'
dygraph_ad_func_parameters = self.get_ad_func_args()
ad_func_call_str = f"""
VLOG(4) << "Eager Prim API {api_func_name}_ad_func call";
return {dygraph_ad_func_name}({dygraph_ad_func_parameters});
"""
# print("ad_func_call_str: ", ad_func_call_str)
return ad_func_call_str
def gene_eager_prim_api_code(self):
api_code = ""
indent = " "
api_func_name = self.get_api__func_name()
template = '<Tensor>'
# function declaration
if api_func_name[-1] != '_':
api_code = f"""
template <>
{self.get_return_type()} {api_func_name}{template}({self.get_declare_args_nodefault()})
"""
else:
api_code = f"""
template <>
{self.get_return_type(inplace_flag=True)} {api_func_name}{template}({self.get_declare_args_nodefault(inplace_flag=True)})
"""
# func code
api_code = api_code + '{'
api_code += f"""{self.gene_ad_func_call()}"""
api_code += '}' + '\n'
return api_code
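# Illustrative sketch: for "multiply" the emitted specialization is roughly
#   template <>
#   Tensor multiply<Tensor>(const Tensor& x, const Tensor& y) {
#     VLOG(4) << "Eager Prim API multiply_ad_func call";
#     return ::multiply_ad_func(x, y);
#   }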
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import yaml
from prim_base import EagerPrimAPI
def header_include():
return """
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/place.h"
#include "paddle/utils/optional.h"
"""
def eager_source_include(header_file_path):
return """
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
"""
def api_namespace():
return (
"""
namespace paddle {
namespace prim {
""",
"""
using Tensor = paddle::experimental::Tensor;
using Scalar = paddle::experimental::Scalar;
using IntArray = paddle::experimental::IntArray;
using DataType = paddle::experimental::DataType;
""",
"""
} // namespace prim
} // namespace paddle
""",
)
def generate_api(api_yaml_path, header_file_path, eager_prim_source_file_path):
apis = []
for each_api_yaml in api_yaml_path:
with open(each_api_yaml, 'r') as f:
api_list = yaml.load(f, Loader=yaml.FullLoader)
if api_list:
apis.extend(api_list)
header_file = open(header_file_path, 'w')
eager_prim_source_file = open(eager_prim_source_file_path, 'w')
namespace = api_namespace()
header_file.write("#pragma once\n")
header_file.write(header_include())
header_file.write(namespace[0])
header_file.write(namespace[1])
include_header_file = (
"#include paddle/fluid/prim/api/generated/prim_api/prim_api.h"
)
eager_prim_source_file.write(eager_source_include(include_header_file))
eager_prim_source_file.write(namespace[0])
for api in apis:
prim_api = EagerPrimAPI(api)
if prim_api.is_prim_api:
header_file.write(prim_api.gene_prim_api_declaration())
eager_prim_source_file.write(prim_api.gene_eager_prim_api_code())
header_file.write(namespace[2])
eager_prim_source_file.write(namespace[2])
header_file.close()
eager_prim_source_file.close()
def main():
parser = argparse.ArgumentParser(
description='Generate PaddlePaddle C++ API files'
)
parser.add_argument(
'--api_yaml_path',
help='path to api yaml file',
nargs='+',
default=['paddle/phi/api/yaml/ops.yaml'],
)
parser.add_argument(
'--prim_api_header_path',
help='output of generated prim_api header code file',
default='paddle/fluid/prim/api/generated/prim_api/prim_api.h',
)
parser.add_argument(
'--eager_prim_api_source_path',
help='output of generated eager_prim_api source code file',
default='paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc',
)
options = parser.parse_args()
api_yaml_path = options.api_yaml_path
prim_api_header_file_path = options.prim_api_header_path
eager_prim_api_source_file_path = options.eager_prim_api_source_path
generate_api(
api_yaml_path,
prim_api_header_file_path,
eager_prim_api_source_file_path,
)
if __name__ == '__main__':
main()
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" #include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/utils/utils.h" #include "paddle/fluid/prim/api/manual/utils/utils.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
...@@ -37,7 +38,7 @@ namespace paddle { ...@@ -37,7 +38,7 @@ namespace paddle {
namespace prim { namespace prim {
template <> template <>
Tensor pow<DescTensor>(const Tensor& x, const paddle::experimental::Scalar& y) { Tensor pow<DescTensor>(const Tensor& x, const Scalar& y) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place()); Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp(); framework::OpDesc* op = block->AppendOp();
...@@ -55,7 +56,7 @@ Tensor pow<DescTensor>(const Tensor& x, const paddle::experimental::Scalar& y) { ...@@ -55,7 +56,7 @@ Tensor pow<DescTensor>(const Tensor& x, const paddle::experimental::Scalar& y) {
template <> template <>
Tensor scale<DescTensor>(const Tensor& x, Tensor scale<DescTensor>(const Tensor& x,
const paddle::experimental::Scalar& scale, const Scalar& scale,
float bias, float bias,
bool bias_after_scale) { bool bias_after_scale) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place()); Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
...@@ -95,63 +96,63 @@ Tensor multiply<DescTensor>(const Tensor& x, const Tensor& y) { ...@@ -95,63 +96,63 @@ Tensor multiply<DescTensor>(const Tensor& x, const Tensor& y) {
} }
template <> template <>
Tensor expand<DescTensor>(const Tensor& x, const IntArray& shape) { Tensor unsqueeze<DescTensor>(const Tensor& x, const IntArray& axis) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place()); Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp(); framework::OpDesc* op = block->AppendOp();
op->SetType("expand_v2"); op->SetType("unsqueeze2");
op->SetInput("X", op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()}); {std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetOutput( op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()}); "Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(shape.GetData().begin(), shape.GetData().end()); std::vector<int> new_shape(axis.GetData().begin(), axis.GetData().end());
op->SetAttr("shape", new_shape); op->SetAttr("axes", new_shape);
op->CheckAttrs(); op->CheckAttrs();
op->InferVarType(block); op->InferVarType(block);
return out; return out;
} }
template <> template <>
Tensor divide<DescTensor>(const Tensor& x, const Tensor& y) { Tensor expand<DescTensor>(const Tensor& x, const IntArray& shape) {
// Grad infershape
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place()); Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp(); framework::OpDesc* op = block->AppendOp();
op->SetType("elementwise_div"); op->SetType("expand_v2");
op->SetInput("X", op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()}); {std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetInput("Y",
{std::static_pointer_cast<prim::DescTensor>(y.impl())->Name()});
op->SetOutput( op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()}); "Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(shape.GetData().begin(), shape.GetData().end());
op->SetAttr("shape", new_shape);
op->CheckAttrs(); op->CheckAttrs();
op->InferVarType(block); op->InferVarType(block);
op->InferShape(*block);
return out; return out;
} }
template <> template <>
Tensor unsqueeze<DescTensor>(const Tensor& x, const IntArray& axis) { Tensor divide<DescTensor>(const Tensor& x, const Tensor& y) {
// Grad infershape
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place()); Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp(); framework::OpDesc* op = block->AppendOp();
op->SetType("unsqueeze2"); op->SetType("elementwise_div");
op->SetInput("X", op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()}); {std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetInput("Y",
{std::static_pointer_cast<prim::DescTensor>(y.impl())->Name()});
op->SetOutput( op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()}); "Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(axis.GetData().begin(), axis.GetData().end());
op->SetAttr("axes", new_shape);
op->CheckAttrs(); op->CheckAttrs();
op->InferVarType(block); op->InferVarType(block);
op->InferShape(*block);
return out; return out;
} }
template <> template <>
Tensor full<DescTensor>(paddle::experimental::IntArray shape, Tensor full<DescTensor>(const IntArray& shape,
paddle::experimental::Scalar value, const Scalar& value,
paddle::experimental::DataType dtype, DataType dtype,
paddle::platform::Place place) { const Place& place) {
// Grad infershape // Grad infershape
Tensor out = empty<DescTensor>({}, dtype, place); Tensor out = empty<DescTensor>({}, dtype, place);
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
...@@ -159,9 +160,8 @@ Tensor full<DescTensor>(paddle::experimental::IntArray shape, ...@@ -159,9 +160,8 @@ Tensor full<DescTensor>(paddle::experimental::IntArray shape,
op->SetType("fill_constant"); op->SetType("fill_constant");
op->SetAttr("shape", shape.GetData()); op->SetAttr("shape", shape.GetData());
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
((dtype == paddle::experimental::DataType::FLOAT32) || ((dtype == DataType::FLOAT32) || (dtype == DataType::FLOAT64) ||
(dtype == paddle::experimental::DataType::FLOAT64) || (dtype == DataType::FLOAT16)),
(dtype == paddle::experimental::DataType::FLOAT16)),
true, true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"We only support float32/float16 for full, but we got data type: %s", "We only support float32/float16 for full, but we got data type: %s",
...@@ -177,9 +177,9 @@ Tensor full<DescTensor>(paddle::experimental::IntArray shape, ...@@ -177,9 +177,9 @@ Tensor full<DescTensor>(paddle::experimental::IntArray shape,
} }
template <> template <>
Tensor sum<DescTensor>(Tensor x, Tensor sum<DescTensor>(const Tensor& x,
paddle::experimental::IntArray axis, const IntArray& axis,
paddle::experimental::DataType dtype, DataType dtype,
bool keepdim) { bool keepdim) {
// Grad infershape // Grad infershape
Tensor out = empty<DescTensor>({}, dtype, paddle::Place()); Tensor out = empty<DescTensor>({}, dtype, paddle::Place());
...@@ -199,12 +199,12 @@ Tensor sum<DescTensor>(Tensor x, ...@@ -199,12 +199,12 @@ Tensor sum<DescTensor>(Tensor x,
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()}); "Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs(); op->CheckAttrs();
op->InferVarType(block); op->InferVarType(block);
// TODO(jiabin): This may have runtime shape skip infershape for now. // TODO(jiabin, cxxly): This may have runtime shape skip infershape for now.
return out; return out;
} }
template <> template <>
Tensor reshape<DescTensor>(Tensor x, paddle::experimental::IntArray shape) { Tensor reshape<DescTensor>(const Tensor& x, const IntArray& shape) {
// Grad infershape // Grad infershape
Tensor out = empty<DescTensor>({}, x.dtype(), paddle::Place()); Tensor out = empty<DescTensor>({}, x.dtype(), paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
...@@ -222,7 +222,23 @@ Tensor reshape<DescTensor>(Tensor x, paddle::experimental::IntArray shape) { ...@@ -222,7 +222,23 @@ Tensor reshape<DescTensor>(Tensor x, paddle::experimental::IntArray shape) {
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()}); "Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs(); op->CheckAttrs();
op->InferVarType(block); op->InferVarType(block);
// TODO(jiabin): This may have runtime shape skip infershape for now. // TODO(jiabin, cxxly): This may have runtime shape skip infershape for now.
return out;
}
template <>
Tensor exp<DescTensor>(const Tensor& x) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("exp");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs();
op->InferVarType(block);
op->InferShape(*block);
return out; return out;
} }
} // namespace prim } // namespace prim
......
add_subdirectory(prim_api)
add_subdirectory(utils) add_subdirectory(utils)
...@@ -49,7 +49,7 @@ void subtract_grad(const Tensor& x, ...@@ -49,7 +49,7 @@ void subtract_grad(const Tensor& x,
sum<T>(scale_out_grad, phi::vectorize(reduce_dim), y.dtype(), false); sum<T>(scale_out_grad, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims())); auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy); set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else { } else {
by_pass<T>(scale_out_grad, dy); by_pass<T>(scale_out_grad, dy);
} }
...@@ -62,7 +62,6 @@ void subtract_grad(const Tensor& x, ...@@ -62,7 +62,6 @@ void subtract_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), x.dtype(), false); sum<T>(out_grad, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims())); auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx); set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else { } else {
by_pass<T>(out_grad, dx); by_pass<T>(out_grad, dx);
} }
...@@ -84,7 +83,6 @@ void add_grad(const Tensor& x, ...@@ -84,7 +83,6 @@ void add_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), y.dtype(), false); sum<T>(out_grad, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims())); auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy); set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else { } else {
by_pass<T>(out_grad, dy); by_pass<T>(out_grad, dy);
} }
...@@ -97,7 +95,6 @@ void add_grad(const Tensor& x, ...@@ -97,7 +95,6 @@ void add_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), x.dtype(), false); sum<T>(out_grad, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims())); auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx); set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else { } else {
by_pass<T>(out_grad, dx); by_pass<T>(out_grad, dx);
} }
...@@ -139,7 +136,6 @@ void sum_grad(const Tensor& x, ...@@ -139,7 +136,6 @@ void sum_grad(const Tensor& x,
x_grad_tmp = expand<T>(out_grad, x_dim); x_grad_tmp = expand<T>(out_grad, x_dim);
} }
set_output<T>(x_grad_tmp, x_grad); set_output<T>(x_grad_tmp, x_grad);
// x_grad->set_impl(x_grad_tmp.impl());
} }
template <typename T> template <typename T>
...@@ -163,10 +159,8 @@ void divide_grad(const Tensor& x, ...@@ -163,10 +159,8 @@ void divide_grad(const Tensor& x,
sum<T>(dy_res, phi::vectorize(reduce_dim), y.dtype(), false); sum<T>(dy_res, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims())); auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy); set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else { } else {
set_output<T>(dy_res, dy); set_output<T>(dy_res, dy);
// dy->set_impl(dy_res.impl());
} }
} // indicate we will compute dy } // indicate we will compute dy
if (dx) { if (dx) {
...@@ -181,10 +175,8 @@ void divide_grad(const Tensor& x, ...@@ -181,10 +175,8 @@ void divide_grad(const Tensor& x,
sum<T>(dx_res, phi::vectorize(reduce_dim), x.dtype(), false); sum<T>(dx_res, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims())); auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx); set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else { } else {
set_output<T>(dx_res, dx); set_output<T>(dx_res, dx);
// dx->set_impl(dx_res.impl());
} }
} // indicate we will compute dx } // indicate we will compute dx
} }
...@@ -196,7 +188,6 @@ void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { ...@@ -196,7 +188,6 @@ void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
auto tmp = divide<T>(div_x, out); auto tmp = divide<T>(div_x, out);
auto x_grad_tmp = multiply<T>(out_grad, tmp); auto x_grad_tmp = multiply<T>(out_grad, tmp);
set_output<T>(x_grad_tmp, x_grad); set_output<T>(x_grad_tmp, x_grad);
// x_grad->set_impl(x_grad_tmp.impl());
} }
} }
} // namespace prim } // namespace prim
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/phi/capi/include/wrapper_base.h"
namespace paddle {
namespace prim {
template <>
Tensor pow<Tensor>(const Tensor& x, const paddle::experimental::Scalar& y) {
return ::pow_ad_func(x, y);
}
template <>
Tensor scale<Tensor>(const Tensor& x,
const paddle::experimental::Scalar& scale,
float bias,
bool bias_after_scale) {
return ::scale_ad_func(x, scale, bias, bias_after_scale);
}
template <>
Tensor multiply<Tensor>(const Tensor& x, const Tensor& y) {
return ::multiply_ad_func(x, y);
}
template <>
Tensor expand<Tensor>(const Tensor& x, const IntArray& shape) {
return ::expand_ad_func(x, shape);
}
template <>
Tensor unsqueeze<Tensor>(const Tensor& x, const IntArray& axis) {
return ::unsqueeze_ad_func(x, axis);
}
template <>
Tensor divide<Tensor>(const Tensor& x, const Tensor& y) {
return ::divide_ad_func(x, y);
}
template <>
Tensor full<Tensor>(paddle::experimental::IntArray shape,
paddle::experimental::Scalar value,
paddle::experimental::DataType dtype,
paddle::platform::Place place) {
return ::full_ad_func(shape, value, dtype, place);
}
template <>
Tensor sum<Tensor>(Tensor x, IntArray axis, DataType dtype, bool keepdim) {
return ::sum_ad_func(x, axis, dtype, keepdim);
}
template <>
Tensor reshape<Tensor>(Tensor x, IntArray shape) {
return ::reshape_ad_func(x, shape);
}
} // namespace prim
} // namespace paddle
...@@ -12,50 +12,15 @@ ...@@ -12,50 +12,15 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// prim api which can't be generated
#pragma once #pragma once
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
namespace paddle {
namespace prim {
using Tensor = paddle::experimental::Tensor;
using IntArray = paddle::experimental::IntArray;
using Scalar = paddle::experimental::Scalar;
template <typename T>
Tensor pow(const Tensor& x, const Scalar& y);
template <typename T>
Tensor scale(const Tensor& X,
const Scalar& scale,
float bias,
bool bias_after_scale);
template <typename T>
Tensor multiply(const Tensor& x, const Tensor& y);
template <typename T>
Tensor expand(const Tensor& x, const IntArray& shape);
template <typename T> namespace paddle {
Tensor unsqueeze(const Tensor& x, const IntArray& axis); namespace prim {} // namespace prim
template <typename T>
Tensor divide(const Tensor& x, const Tensor& y);
template <typename T>
Tensor full(IntArray shape,
Scalar value,
DataType dtype = DataType::FLOAT32,
Place place = CPUPlace());
template <typename T>
Tensor sum(Tensor x,
IntArray axis = {},
DataType dtype = DataType::UNDEFINED,
bool keepdim = false);
template <typename T>
Tensor reshape(Tensor x, IntArray shape);
} // namespace prim
} // namespace paddle } // namespace paddle
...@@ -16,11 +16,12 @@ ...@@ -16,11 +16,12 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
using IntArray = paddle::experimental::IntArray;
namespace paddle { namespace paddle {
namespace prim { namespace prim {
// We put some api like utils here // We put some api like utils here
...@@ -42,42 +43,40 @@ void set_output(const paddle::experimental::Tensor& x_tmp, ...@@ -42,42 +43,40 @@ void set_output(const paddle::experimental::Tensor& x_tmp,
paddle::experimental::Tensor* x); paddle::experimental::Tensor* x);
// These method don't need to be specified // These method don't need to be specified
static phi::DDim get_reduce_dims(const phi::DDim& x_dims, static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims,
const phi::DDim& y_dims) { const phi::DDim& in_dims) {
std::vector<int64_t> result; std::vector<int64_t> result;
PADDLE_ENFORCE_GE(phi::product(x_dims), int bat = dout_dims.size() - in_dims.size();
phi::product(y_dims),
phi::errors::InvalidArgument(
"Only x_dims >= y_dims is accepted for "
"get_reduce_dims, but we got x_dims: %s, y_dims: %s",
x_dims,
y_dims));
int bat = x_dims.size() - y_dims.size();
for (int i = 0; i < bat; ++i) { for (int i = 0; i < bat; ++i) {
result.push_back(i); result.push_back(i);
} }
for (int i = 0; i < y_dims.size(); ++i) { for (int i = 0; i < in_dims.size(); ++i) {
if (y_dims[i] == 1) { if (in_dims[i] == 1) {
result.push_back(i + bat); result.push_back(i + bat);
} else { } else {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
y_dims[i], in_dims[i],
x_dims[i + bat], dout_dims[i + bat],
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"ReduceDims dimension mismatch. Operands could " "ReduceDims dimension mismatch. Operands could "
"not be broadcast together with the shape of x_dims = [%s] and " "not be broadcast together with the shape of dout = [%s] and "
"the shape of y_dims = [%s]. Received [%d] in X is not equal to " "the shape of in_dims = [%s]. Received [%d] in X is not equal to "
"[%d] in Y at i:%d.", "[%d] in Y at i:%d.",
x_dims, dout_dims,
y_dims, in_dims,
x_dims[i + bat], dout_dims[i + bat],
y_dims[i], in_dims[i],
i)); i));
} }
} }
auto res_dims = phi::make_ddim(result); return phi::make_ddim(result);
VLOG(4) << "Reduce Dims is: " << res_dims; }
return res_dims;
static phi::DDim get_reduce_dims(const phi::DDim& x_dims,
const phi::DDim& y_dims) {
auto out_dims = paddle::operators::details::BroadcastTwoDims(x_dims, y_dims);
return get_reduce_dims_from_out(out_dims, x_dims);
} }
} // namespace prim } // namespace prim
} // namespace paddle } // namespace paddle
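The helper added above, `get_reduce_dims_from_out`, picks the axes a broadcasted gradient must be summed over: all leading axes that exist only in `dout`, plus every input axis whose size is 1. Below is a standalone sketch of that logic using plain `std::vector<int64_t>` instead of `phi::DDim`; it is an illustration of the rule, not the phi implementation itself.

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Mirror of the reduce-dims rule: reduce over the extra leading axes of
// dout_dims and over every axis where in_dims is 1; all other axes must match.
std::vector<int64_t> GetReduceDimsFromOut(const std::vector<int64_t>& dout_dims,
                                          const std::vector<int64_t>& in_dims) {
  std::vector<int64_t> result;
  int bat = static_cast<int>(dout_dims.size()) - static_cast<int>(in_dims.size());
  for (int i = 0; i < bat; ++i) result.push_back(i);  // broadcasted leading axes
  for (size_t i = 0; i < in_dims.size(); ++i) {
    if (in_dims[i] == 1) {
      result.push_back(static_cast<int64_t>(i) + bat);  // axis expanded from 1
    } else {
      assert(in_dims[i] == dout_dims[i + bat]);  // non-broadcast axes must match
    }
  }
  return result;
}

int main() {
  // dout has shape [2, 3, 4]; the forward input had shape [3, 1],
  // so axes 0 (batch) and 2 (expanded from 1) are reduced.
  for (int64_t d : GetReduceDimsFromOut({2, 3, 4}, {3, 1})) std::cout << d << " ";
  std::cout << "\n";  // prints: 0 2
}
```

This is the set of axes `divide_grad` and similar composite backward rules pass to `sum<T>` before reshaping the result back to the input's shape.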
...@@ -477,6 +477,9 @@ class GradCompositeOpMakerBase { ...@@ -477,6 +477,9 @@ class GradCompositeOpMakerBase {
void RecoverOutputName(const paddle::experimental::Tensor& output, void RecoverOutputName(const paddle::experimental::Tensor& output,
const std::string& origin_name) { const std::string& origin_name) {
if (origin_name == framework::kEmptyVarName) return; if (origin_name == framework::kEmptyVarName) return;
VLOG(4) << "Recover: "
<< static_cast<prim::DescTensor*>(output.impl().get())->Name()
<< " To: " << origin_name;
prim::StaticCompositeContext::Instance().GetBlock()->RenameVar( prim::StaticCompositeContext::Instance().GetBlock()->RenameVar(
static_cast<prim::DescTensor*>(output.impl().get())->Name(), static_cast<prim::DescTensor*>(output.impl().get())->Name(),
origin_name); origin_name);
......
...@@ -26,9 +26,9 @@ static PyObject *eager_api_run_program(PyObject *self, ...@@ -26,9 +26,9 @@ static PyObject *eager_api_run_program(PyObject *self,
PyObject *kwargs) { PyObject *kwargs) {
PyThreadState *tstate = nullptr; PyThreadState *tstate = nullptr;
try { try {
auto X = GetTensorListFromArgs("run_program", "X", args, 0, false); auto X = GetTensorListFromArgs("run_program", "X", args, 0, true);
auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true); auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true);
auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false); auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, true);
auto OutScope = auto OutScope =
GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false);
auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true);
......
...@@ -1503,7 +1503,7 @@ static PyObject* tensor_method_set_string_list(TensorObject* self, ...@@ -1503,7 +1503,7 @@ static PyObject* tensor_method_set_string_list(TensorObject* self,
PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
using Strings = std::vector<std::string>; using Strings = paddle::framework::Strings;
auto strings = CastPyArg2VectorOfString(PyTuple_GET_ITEM(args, 0), 0); auto strings = CastPyArg2VectorOfString(PyTuple_GET_ITEM(args, 0), 0);
auto var_tensor = std::make_shared<egr::VariableCompatTensor>(); auto var_tensor = std::make_shared<egr::VariableCompatTensor>();
*var_tensor->GetMutable<Strings>() = strings; *var_tensor->GetMutable<Strings>() = strings;
......
...@@ -184,39 +184,41 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { ...@@ -184,39 +184,41 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
value[i] = ddim[i]; value[i] = ddim[i];
} }
} }
if (!egr::IsVariableCompatTensor(self->tensor)) {
auto desired_layout = auto desired_layout =
paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout();
auto default_layout = auto default_layout =
paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout();
bool change_dim = bool change_dim =
(desired_layout != default_layout && (desired_layout != default_layout &&
self->tensor.layout() == desired_layout && value.size() == 4); self->tensor.layout() == desired_layout && value.size() == 4);
VLOG(6) << "eager_properties 'Shape' method, layout autotune " VLOG(6) << "eager_properties 'Shape' method, layout autotune "
<< " desired_layout: " << desired_layout << " desired_layout: " << desired_layout
<< " default_layout: " << default_layout << " default_layout: " << default_layout
<< " tensor layout: " << self->tensor.layout() << " tensor layout: " << self->tensor.layout()
<< " tensor's shape size is : " << value.size(); << " tensor's shape size is : " << value.size();
std::vector<int64_t> dims = value; std::vector<int64_t> dims = value;
if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") { if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") {
// NCHW -> NHWC // NCHW -> NHWC
VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] << " " VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0]
<< value[1] << " " << value[2] << " " << value[3] << " to " << " " << value[1] << " " << value[2] << " " << value[3] << " to "
<< dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1]; << dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1];
value[0] = dims[0]; value[0] = dims[0];
value[1] = dims[2]; value[1] = dims[2];
value[2] = dims[3]; value[2] = dims[3];
value[3] = dims[1]; value[3] = dims[1];
} else if (change_dim && phi::DataLayoutToString(desired_layout) == "NHWC") { } else if (change_dim &&
// NHWC -> NCHW phi::DataLayoutToString(desired_layout) == "NHWC") {
VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] << " " // NHWC -> NCHW
<< value[1] << " " << value[2] << " " << value[3] << " to " VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0]
<< dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2] << " " << value[1] << " " << value[2] << " " << value[3] << " to "
<< " " << dims[1]; << dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2]
value[0] = dims[0]; << " " << dims[1];
value[1] = dims[3]; value[0] = dims[0];
value[2] = dims[1]; value[1] = dims[3];
value[3] = dims[2]; value[2] = dims[1];
value[3] = dims[2];
}
} }
return ToPyObject(value); return ToPyObject(value);
......
...@@ -624,6 +624,11 @@ void BindImperative(py::module *m_ptr) { ...@@ -624,6 +624,11 @@ void BindImperative(py::module *m_ptr) {
m.def("_cleanup_mmap_fds", m.def("_cleanup_mmap_fds",
[]() { memory::allocation::MemoryMapFdSet::Instance().Clear(); }); []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); });
m.def("_set_max_memory_map_allocation_pool_size", [](int32_t size) {
memory::allocation::MemoryMapAllocationPool::Instance().SetMaxPoolSize(
size);
});
#endif #endif
m.def("start_imperative_gperf_profiler", m.def("start_imperative_gperf_profiler",
......
...@@ -970,7 +970,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -970,7 +970,7 @@ All parameter, weight, gradient are variables in Paddle.
} }
}) })
.def("set_string_list", .def("set_string_list",
[](Variable &self, Strings str_list) { [](Variable &self, std::vector<std::string> str_list) {
*self.GetMutable<Strings>() = str_list; *self.GetMutable<Strings>() = str_list;
}) })
.def("set_vocab", .def("set_vocab",
...@@ -1926,7 +1926,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1926,7 +1926,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable", m.def("set_feed_variable",
static_cast<void (*)( // NOLINT static_cast<void (*)( // NOLINT
Scope *, Scope *,
const Strings &, const std::vector<std::string> &,
const std::string &, const std::string &,
size_t)>(&framework::SetFeedVariable)); size_t)>(&framework::SetFeedVariable));
m.def("get_fetch_variable", m.def("get_fetch_variable",
......
...@@ -182,6 +182,7 @@ limitations under the License. */ ...@@ -182,6 +182,7 @@ limitations under the License. */
#include "pybind11/stl.h" #include "pybind11/stl.h"
DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn);
DECLARE_bool(use_shm_cache);
// disable auto conversion to list in Python // disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
...@@ -910,9 +911,16 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -910,9 +911,16 @@ void BindTensor(pybind11::module &m) { // NOLINT
int flags = memory::allocation::MAPPED_SHAREDMEM | int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_EXCLUSIVE; memory::allocation::MAPPED_EXCLUSIVE;
std::string handle = memory::allocation::GetIPCName(); std::string handle = memory::allocation::GetIPCName();
int find_id = -1;
if (FLAGS_use_shm_cache) {
find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, data_size); // NOLINT
}
if (find_id != -1) {
handle = memory::allocation::MemoryMapAllocationPool::Instance().GetById(find_id).file_name_; // NOLINT
}
auto shared_holder = auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation( memory::allocation::AllocateRefcountedMemoryMapAllocation(
handle, flags, data_size); handle, flags, data_size, find_id);
// copy data & reset holder // copy data & reset holder
if (platform::is_cuda_pinned_place(holder->place())) { if (platform::is_cuda_pinned_place(holder->place())) {
...@@ -961,10 +969,13 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -961,10 +969,13 @@ void BindTensor(pybind11::module &m) { // NOLINT
size_t size = t[1].cast<size_t>(); size_t size = t[1].cast<size_t>();
int flags = memory::allocation::MAPPED_SHAREDMEM | int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_NOCREATE; memory::allocation::MAPPED_NOCREATE;
int find_id = -1;
if (FLAGS_use_shm_cache) {
find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, size, ipc_name, /*check_refcount*/ false); // NOLINT
}
auto shared_holder = auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation( memory::allocation::AllocateRefcountedMemoryMapAllocation(
ipc_name, flags, size); ipc_name, flags, size, find_id);
// 3. Rebuild Tensor // 3. Rebuild Tensor
tensor.ResetHolderWithType( tensor.ResetHolderWithType(
......
...@@ -22,6 +22,9 @@ limitations under the License. */ ...@@ -22,6 +22,9 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/string_tensor_utils.h" #include "paddle/phi/core/string_tensor_utils.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
namespace paddle { namespace paddle {
namespace experimental { namespace experimental {
...@@ -54,6 +57,11 @@ bool HasAllocation(const phi::TensorBase& t) { ...@@ -54,6 +57,11 @@ bool HasAllocation(const phi::TensorBase& t) {
BackendSet GetTensorBackendSet(const phi::TensorBase& t) { BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) { if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (t.place().GetType() == AllocationType::CUSTOM) {
phi::DeviceManager::SetDevice(t.place());
}
#endif
phi::Backend backend_key = phi::TransToPhiBackend(t.place()); phi::Backend backend_key = phi::TransToPhiBackend(t.place());
BackendSet backend_set(backend_key); BackendSet backend_set(backend_key);
if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) && if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) &&
......
...@@ -431,6 +431,7 @@ ...@@ -431,6 +431,7 @@
kernel : kernel :
func : exp_grad func : exp_grad
inplace : (out_grad -> x_grad) inplace : (out_grad -> x_grad)
composite : exp_grad(out, out_grad, x_grad)
- backward_op : expm1_grad - backward_op : expm1_grad
forward : expm1 (Tensor x) -> Tensor(out) forward : expm1 (Tensor x) -> Tensor(out)
...@@ -1085,6 +1086,30 @@ ...@@ -1085,6 +1086,30 @@
func : selu_grad func : selu_grad
data_type : out data_type : out
- backward_op : send_u_recv_grad
forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM")
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : send_u_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_ue_recv_grad
forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : send_ue_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_uv_grad - backward_op : send_uv_grad
forward : send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out) forward : send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out)
args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD") args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD")
...@@ -1191,6 +1216,17 @@ ...@@ -1191,6 +1216,17 @@
func : sinh_grad func : sinh_grad
inplace : (out_grad -> x_grad) inplace : (out_grad -> x_grad)
- backward_op : slogdet_grad
forward : slogdet (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : slogdet_grad
data_type : out_grad
- backward_op : softplus_grad - backward_op : softplus_grad
forward : softplus (Tensor x, float beta, float threshold) -> Tensor(out) forward : softplus (Tensor x, float beta, float threshold) -> Tensor(out)
args : (Tensor x, Tensor out_grad, float beta, float threshold) args : (Tensor x, Tensor out_grad, float beta, float threshold)
......
...@@ -316,9 +316,14 @@ ...@@ -316,9 +316,14 @@
- backward_op : cumsum_grad - backward_op : cumsum_grad
forward : cumsum(Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out) forward : cumsum(Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out)
args : (Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse) args : (Tensor x, Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse)
output : Tensor(x_grad) output : Tensor(x_grad)
invoke : cumsum(out_grad, axis, flatten, exclusive, !reverse) infer_meta :
func : UnchangedInferMeta
param: [x]
kernel :
func : cumsum_grad
data_type: x
- backward_op : deformable_conv_grad - backward_op : deformable_conv_grad
forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out) forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out)
...@@ -475,6 +480,7 @@ ...@@ -475,6 +480,7 @@
func : expand_grad func : expand_grad
no_need_buffer : x no_need_buffer : x
backward : expand_double_grad backward : expand_double_grad
composite: expand_grad(x, out_grad, shape, x_grad_p)
- backward_op : exponential__grad - backward_op : exponential__grad
forward : exponential_ (Tensor x, float lam) -> Tensor(out) forward : exponential_ (Tensor x, float lam) -> Tensor(out)
...@@ -880,6 +886,7 @@ ...@@ -880,6 +886,7 @@
param : [x, y] param : [x, y]
kernel : kernel :
func : multiply_grad func : multiply_grad
composite: multiply_grad(x, y, out_grad, axis, x_grad, y_grad)
backward : multiply_double_grad backward : multiply_double_grad
- backward_op : multiply_triple_grad - backward_op : multiply_triple_grad
...@@ -1201,30 +1208,6 @@ ...@@ -1201,30 +1208,6 @@
data_type : x data_type : x
optional : summed_ids optional : summed_ids
- backward_op : send_u_recv_grad
forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM")
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : send_u_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_ue_recv_grad
forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : send_ue_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : sigmoid_cross_entropy_with_logits_grad - backward_op : sigmoid_cross_entropy_with_logits_grad
forward : sigmoid_cross_entropy_with_logits (Tensor x, Tensor label, bool normalize, int ignore_index) -> Tensor(out) forward : sigmoid_cross_entropy_with_logits (Tensor x, Tensor label, bool normalize, int ignore_index) -> Tensor(out)
args : (Tensor x, Tensor label, Tensor out_grad, bool normalize, int ignore_index) args : (Tensor x, Tensor label, Tensor out_grad, bool normalize, int ignore_index)
...@@ -1260,16 +1243,6 @@ ...@@ -1260,16 +1243,6 @@
backward : slice_double_grad backward : slice_double_grad
no_need_buffer : input no_need_buffer : input
- backward_op : slogdet_grad
forward : slogdet (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
param : [x]
kernel :
func : slogdet_grad
- backward_op : softmax_grad - backward_op : softmax_grad
forward : softmax (Tensor x, int axis) -> Tensor(out) forward : softmax (Tensor x, int axis) -> Tensor(out)
args : (Tensor out, Tensor out_grad, int axis) args : (Tensor out, Tensor out_grad, int axis)
......
...@@ -1298,16 +1298,6 @@ ...@@ -1298,16 +1298,6 @@
kernel : kernel :
func : not_equal func : not_equal
- op : numel
args : (Tensor x)
output : Tensor(size)
infer_meta :
func : NumelInferMeta
kernel :
func : numel
data_transform:
skip_transform : x
- op : one_hot - op : one_hot
args : (Tensor x, Scalar(int) num_classes) args : (Tensor x, Scalar(int) num_classes)
output : Tensor(out) output : Tensor(out)
...@@ -1588,28 +1578,6 @@ ...@@ -1588,28 +1578,6 @@
data_type : x data_type : x
backward : segment_pool_grad backward : segment_pool_grad
- op : send_u_recv
args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendURecvInferMeta
kernel :
func : send_u_recv
data_type : x
intermediate : dst_count
backward : send_u_recv_grad
- op : send_ue_recv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size)
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendUERecvInferMeta
kernel :
func : send_ue_recv
data_type : x
intermediate : dst_count
backward : send_ue_recv_grad
- op : sgd_ - op : sgd_
args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision) args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision)
output : Tensor(param_out), Tensor(master_param_out) output : Tensor(param_out), Tensor(master_param_out)
...@@ -1663,15 +1631,6 @@ ...@@ -1663,15 +1631,6 @@
func : slice func : slice
backward : slice_grad backward : slice_grad
- op : slogdet
args : (Tensor x)
output : Tensor
infer_meta :
func : UnchangedInferMeta
kernel :
func : slogdet
backward : slogdet_grad
- op : softmax - op : softmax
args : (Tensor x, int axis) args : (Tensor x, int axis)
output : Tensor(out) output : Tensor(out)
......
...@@ -993,6 +993,12 @@ ...@@ -993,6 +993,12 @@
outputs : outputs :
{out : Out, total_weight : Total_weight} {out : Out, total_weight : Total_weight}
- op : numel(size)
inputs :
x : Input
outputs :
size : Out
- op : overlap_add - op : overlap_add
backward : overlap_add_grad backward : overlap_add_grad
inputs : inputs :
...@@ -1215,6 +1221,28 @@ ...@@ -1215,6 +1221,28 @@
outputs : outputs :
out : Out out : Out
- op : send_u_recv(graph_send_recv)
backward : send_u_recv_grad(graph_send_recv_grad)
inputs :
{x : X, src_index : Src_index, dst_index : Dst_index}
outputs :
{out : Out, dst_count : Dst_count}
int_array :
out_size:
data_type : int64_t
tensor_name : Out_size
- op : send_ue_recv(graph_send_ue_recv)
backward : send_ue_recv_grad(graph_send_ue_recv_grad)
inputs :
{x : X, y : Y, src_index : Src_index, dst_index : Dst_index}
outputs :
{out : Out, dst_count : Dst_count}
int_array :
out_size:
data_type : int64_t
tensor_name : Out_size
- op : send_uv (graph_send_uv) - op : send_uv (graph_send_uv)
backward : send_uv_grad (graph_send_uv_grad) backward : send_uv_grad (graph_send_uv_grad)
...@@ -1286,6 +1314,13 @@ ...@@ -1286,6 +1314,13 @@
extra : extra :
attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"]
- op : slogdet(slogdeterminant)
backward : slogdet_grad(slogdeterminant_grad)
inputs :
x : Input
outputs :
out : Out
- op : softmax - op : softmax
backward : softmax_grad backward : softmax_grad
inputs : inputs :
......
...@@ -871,6 +871,18 @@ ...@@ -871,6 +871,18 @@
kernel : kernel :
func : npu_identity func : npu_identity
- op : numel
args : (Tensor x)
output : Tensor(size)
infer_meta :
func : NumelInferMeta
kernel :
func : numel
data_type : x
data_transform:
skip_transform : x
no_need_buffer : x
- op : overlap_add - op : overlap_add
args: (Tensor x, int hop_length, int axis=-1) args: (Tensor x, int hop_length, int axis=-1)
output: Tensor output: Tensor
...@@ -1029,6 +1041,28 @@ ...@@ -1029,6 +1041,28 @@
func : selu func : selu
backward : selu_grad backward : selu_grad
- op : send_u_recv
args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendURecvInferMeta
kernel :
func : send_u_recv
data_type : x
intermediate : dst_count
backward : send_u_recv_grad
- op : send_ue_recv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendUERecvInferMeta
kernel :
func : send_ue_recv
data_type : x
intermediate : dst_count
backward : send_ue_recv_grad
- op : send_uv - op : send_uv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD")
output : Tensor(out) output : Tensor(out)
...@@ -1083,6 +1117,15 @@ ...@@ -1083,6 +1117,15 @@
func : sinh func : sinh
backward : sinh_grad backward : sinh_grad
- op : slogdet
args : (Tensor x)
output : Tensor
infer_meta :
func : UnchangedInferMeta
kernel :
func : slogdet
backward : slogdet_grad
- op : softplus - op : softplus
args : (Tensor x, float beta = 1.0, float threshold = 20.0f) args : (Tensor x, float beta = 1.0, float threshold = 20.0f)
output : Tensor output : Tensor
......
...@@ -360,6 +360,7 @@ XPUOpMap& get_kl2_ops() { ...@@ -360,6 +360,7 @@ XPUOpMap& get_kl2_ops() {
{"log_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"log_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"log_softmax", XPUKernelSet({phi::DataType::FLOAT32})}, {"log_softmax", XPUKernelSet({phi::DataType::FLOAT32})},
{"log_softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"log_softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"logical_not", XPUKernelSet({phi::DataType::BOOL})},
{"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"lookup_table_v2", XPUKernelSet({phi::DataType::FLOAT32})}, {"lookup_table_v2", XPUKernelSet({phi::DataType::FLOAT32})},
{"masked_select", {"masked_select",
......
...@@ -61,11 +61,13 @@ struct XPUContext::Impl { ...@@ -61,11 +61,13 @@ struct XPUContext::Impl {
~Impl() { ~Impl() {
if (owned_ && context_ != nullptr) { if (owned_ && context_ != nullptr) {
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
// manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor
xpu_wait(context_->xpu_stream); xpu_wait(context_->xpu_stream);
xpu_stream_destroy(context_->xpu_stream); if (context_->xpu_stream) {
context_->xpu_stream = nullptr; // manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor
xpu_stream_destroy(context_->xpu_stream);
context_->xpu_stream = nullptr;
}
xpu::destroy_context(context_); xpu::destroy_context(context_);
context_ = nullptr; context_ = nullptr;
} }
...@@ -73,11 +75,7 @@ struct XPUContext::Impl { ...@@ -73,11 +75,7 @@ struct XPUContext::Impl {
const Place& GetPlace() const { return place_; } const Place& GetPlace() const { return place_; }
XPUStream stream() const { XPUStream stream() const { return context_->xpu_stream; }
auto s = context_->xpu_stream;
PD_CHECK(s != nullptr, "the xpu stream is nullptr.");
return s;
}
xpu::Context* GetXContext() const { xpu::Context* GetXContext() const {
PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
...@@ -103,13 +101,20 @@ struct XPUContext::Impl { ...@@ -103,13 +101,20 @@ struct XPUContext::Impl {
context_ = xpu::create_context(); context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device); xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache(); SetL3Cache();
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream));
} }
void SetXContext(xpu::Context* context) { context_ = context; } void SetXContext(xpu::Context* context) { context_ = context; }
void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
void CreateStream() {
if (context_->xpu_stream) {
VLOG(3) << "xpu stream is already created for current context";
return;
}
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream));
}
bool owned_{false}; bool owned_{false};
Place place_; Place place_;
backends::xpu::XPUVersion xpu_version_; backends::xpu::XPUVersion xpu_version_;
...@@ -153,6 +158,8 @@ void XPUContext::SetBkclContext(xpu::BKCLContext_t context) { ...@@ -153,6 +158,8 @@ void XPUContext::SetBkclContext(xpu::BKCLContext_t context) {
impl_->SetBkclContext(context); impl_->SetBkclContext(context);
} }
void XPUContext::CreateStream() { impl_->CreateStream(); }
void XPUContext::Init() { impl_->Init(); } void XPUContext::Init() { impl_->Init(); }
} // namespace phi } // namespace phi
...@@ -46,6 +46,7 @@ class XPUContext : public DeviceContext, ...@@ -46,6 +46,7 @@ class XPUContext : public DeviceContext,
// Return bkcl context. // Return bkcl context.
xpu::BKCLContext_t bkcl_context() const; xpu::BKCLContext_t bkcl_context() const;
void SetBkclContext(xpu::BKCLContext_t context); void SetBkclContext(xpu::BKCLContext_t context);
void CreateStream();
// Wait for all operations completion in the stream. // Wait for all operations completion in the stream.
void Wait() const override; void Wait() const override;
......
...@@ -134,7 +134,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { ...@@ -134,7 +134,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
default: { default: {
size_t device_type_id_ = static_cast<size_t>(backend) - size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS); static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_); std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) { if (!device_type.empty()) {
os << device_type; os << device_type;
} else { } else {
...@@ -178,7 +180,8 @@ inline Backend StringToBackend(const char* backend_cstr) { ...@@ -178,7 +180,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::IPU; return Backend::IPU;
} else { } else {
return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) + return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
phi::GetOrRegisterGlobalDeviceTypeId(s)); phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(s));
} }
} }
...@@ -207,7 +210,9 @@ inline std::string BackendToString(const Backend& backend) { ...@@ -207,7 +210,9 @@ inline std::string BackendToString(const Backend& backend) {
default: default:
size_t device_type_id_ = static_cast<size_t>(backend) - size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS); static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_); std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) { if (!device_type.empty()) {
return device_type; return device_type;
} else { } else {
......
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <unordered_map>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/api/ext/exception.h"
...@@ -54,7 +53,8 @@ std::string Place::DebugString() const { ...@@ -54,7 +53,8 @@ std::string Place::DebugString() const {
std::ostringstream os; std::ostringstream os;
os << "Place("; os << "Place(";
if (alloc_type_ == AllocationType::CUSTOM) { if (alloc_type_ == AllocationType::CUSTOM) {
os << GetGlobalDeviceType(device_type_id_); os << phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
} else { } else {
os << AllocationTypeStr(alloc_type_); os << AllocationTypeStr(alloc_type_);
} }
...@@ -85,25 +85,29 @@ Place GetPinnedPlace(const Place &place) { ...@@ -85,25 +85,29 @@ Place GetPinnedPlace(const Place &place) {
} }
} }
static std::unordered_map<std::string, size_t> global_registered_device_type_id; CustomRegisteredDeviceMap &CustomRegisteredDeviceMap::Instance() {
static std::unordered_map<size_t, std::string> global_registered_device_type; static CustomRegisteredDeviceMap g_custom_registered_device_map;
return g_custom_registered_device_map;
}
size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) { size_t CustomRegisteredDeviceMap::GetOrRegisterGlobalDeviceTypeId(
const std::string &device_type) {
if (device_type.empty()) return 0; if (device_type.empty()) return 0;
if (global_registered_device_type_id.find(device_type) == if (registered_device_type_id_.find(device_type) ==
global_registered_device_type_id.end()) { registered_device_type_id_.end()) {
size_t device_type_id = global_registered_device_type_id.size() + 1; size_t device_type_id = registered_device_type_id_.size() + 1;
global_registered_device_type_id[device_type] = device_type_id; registered_device_type_id_[device_type] = device_type_id;
global_registered_device_type[device_type_id] = device_type; registered_device_type_[device_type_id] = device_type;
} }
return global_registered_device_type_id[device_type]; return registered_device_type_id_[device_type];
} }
std::string GetGlobalDeviceType(size_t device_type_id) { std::string CustomRegisteredDeviceMap::GetGlobalDeviceType(
if (global_registered_device_type.find(device_type_id) == size_t device_type_id) {
global_registered_device_type.end()) if (registered_device_type_.find(device_type_id) ==
registered_device_type_.end())
return ""; return "";
return global_registered_device_type[device_type_id]; return registered_device_type_[device_type_id];
} }
constexpr static int kAllocationTypeBitLength = 8; constexpr static int kAllocationTypeBitLength = 8;
...@@ -143,7 +147,9 @@ static int8_t GetCorrectDeviceIdByPlaceType( ...@@ -143,7 +147,9 @@ static int8_t GetCorrectDeviceIdByPlaceType(
Place::Place(paddle::PlaceType type) Place::Place(paddle::PlaceType type)
: device(detail::GetCorrectDeviceIdByPlaceType(type)), : device(detail::GetCorrectDeviceIdByPlaceType(type)),
alloc_type_(static_cast<AllocationType>(type)), alloc_type_(static_cast<AllocationType>(type)),
device_type_id_(GetOrRegisterGlobalDeviceTypeId("")) { device_type_id_(
CustomRegisteredDeviceMap::Instance().GetOrRegisterGlobalDeviceTypeId(
"")) {
LOG_FIRST_N(WARNING, 1) LOG_FIRST_N(WARNING, 1)
<< "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version " << "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version "
"2.3, and will be removed in version 2.4! Please use " "2.3, and will be removed in version 2.4! Please use "
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include <unordered_map>
#include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/api/include/dll_decl.h"
...@@ -37,11 +38,21 @@ enum class AllocationType : int8_t { ...@@ -37,11 +38,21 @@ enum class AllocationType : int8_t {
CUSTOM = 9, CUSTOM = 9,
}; };
const char* AllocationTypeStr(AllocationType type); class CustomRegisteredDeviceMap {
public:
static CustomRegisteredDeviceMap& Instance();
size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type);
std::string GetGlobalDeviceType(size_t device_type_id_); std::string GetGlobalDeviceType(size_t device_type_id_);
private:
CustomRegisteredDeviceMap() = default;
std::unordered_map<std::string, size_t> registered_device_type_id_;
std::unordered_map<size_t, std::string> registered_device_type_;
};
const char* AllocationTypeStr(AllocationType type);
/// \brief The place is used to specify where the data is stored. /// \brief The place is used to specify where the data is stored.
class PADDLE_API Place { class PADDLE_API Place {
...@@ -53,12 +64,14 @@ class PADDLE_API Place { ...@@ -53,12 +64,14 @@ class PADDLE_API Place {
const std::string& dev_type = "") const std::string& dev_type = "")
: device(id), : device(id),
alloc_type_(type), alloc_type_(type),
device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} device_type_id_(phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
explicit Place(AllocationType type, const std::string& dev_type = "") explicit Place(AllocationType type, const std::string& dev_type = "")
: device(0), : device(0),
alloc_type_(type), alloc_type_(type),
device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} device_type_id_(phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
// See NOTE [ Why need to temporarily adapt to PlaceType? ] // See NOTE [ Why need to temporarily adapt to PlaceType? ]
Place(paddle::PlaceType type); // NOLINT Place(paddle::PlaceType type); // NOLINT
...@@ -69,7 +82,8 @@ class PADDLE_API Place { ...@@ -69,7 +82,8 @@ class PADDLE_API Place {
alloc_type_ = type; alloc_type_ = type;
device = device_id; device = device_id;
if (!dev_type.empty()) { if (!dev_type.empty()) {
device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type); device_type_id_ = phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type);
} }
} }
...@@ -78,7 +92,8 @@ class PADDLE_API Place { ...@@ -78,7 +92,8 @@ class PADDLE_API Place {
int8_t GetDeviceId() const { return device; } int8_t GetDeviceId() const { return device; }
std::string GetDeviceType() const { std::string GetDeviceType() const {
return GetGlobalDeviceType(device_type_id_); return phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
} }
std::string DebugString() const; std::string DebugString() const;
......
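For reference, a minimal usage sketch of the `CustomRegisteredDeviceMap` singleton declared above: it assumes the translation unit is built against the phi library, and the device type name `"fake_dev"` is only an example, not a device Paddle ships.

```cpp
#include <iostream>

#include "paddle/phi/common/place.h"

int main() {
  auto& dev_map = phi::CustomRegisteredDeviceMap::Instance();
  // Registers "fake_dev" on first call and returns its id; later calls
  // with the same name return the same id.
  size_t id = dev_map.GetOrRegisterGlobalDeviceTypeId("fake_dev");
  // Maps the id back to the registered device type name.
  std::cout << id << " -> " << dev_map.GetGlobalDeviceType(id) << "\n";
}
```

This replaces the previous free functions `GetOrRegisterGlobalDeviceTypeId` / `GetGlobalDeviceType` backed by file-local global maps, as shown in the `place.cc` hunk above.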
...@@ -110,6 +110,7 @@ class ArgumentMappingContext { ...@@ -110,6 +110,7 @@ class ArgumentMappingContext {
virtual bool IsSelectedRowsInput(const std::string& name) const = 0; virtual bool IsSelectedRowsInput(const std::string& name) const = 0;
virtual bool IsSelectedRowsInputs(const std::string& name) const = 0; virtual bool IsSelectedRowsInputs(const std::string& name) const = 0;
virtual bool IsSparseCooTensorInput(const std::string& name) const = 0; virtual bool IsSparseCooTensorInput(const std::string& name) const = 0;
virtual bool IsSparseCooTensorOutput(const std::string& name) const = 0;
virtual bool IsSparseCsrTensorInput(const std::string& name) const = 0; virtual bool IsSparseCsrTensorInput(const std::string& name) const = 0;
// For compatibility with LoDTensorArray // For compatibility with LoDTensorArray
virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0;
......
...@@ -46,7 +46,8 @@ Backend TransToPhiBackend(const phi::Place& place) { ...@@ -46,7 +46,8 @@ Backend TransToPhiBackend(const phi::Place& place) {
case AllocationType::CUSTOM: case AllocationType::CUSTOM:
return static_cast<Backend>( return static_cast<Backend>(
static_cast<size_t>(Backend::NUM_BACKENDS) + static_cast<size_t>(Backend::NUM_BACKENDS) +
GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType())); phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
default: default:
PADDLE_THROW(phi::errors::InvalidArgument( PADDLE_THROW(phi::errors::InvalidArgument(
"Unsupported transform %s to phi Backend.", place)); "Unsupported transform %s to phi Backend.", place));
...@@ -91,7 +92,9 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { ...@@ -91,7 +92,9 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
size_t device_type_id_ = static_cast<size_t>(backend) - size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS); static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_); std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) { if (!device_type.empty()) {
return phi::CustomPlace( return phi::CustomPlace(
device_type, device_type,
......
...@@ -1010,6 +1010,18 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, ...@@ -1010,6 +1010,18 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune,
#endif #endif
/*
* CUDA Graph related FLAG
* Name: FLAGS_new_executor_use_cuda_graph
* Since Version: 2.4
* Value Range: bool, default=false
* Example: FLAGS_new_executor_use_cuda_graph=true would allow
* new executor to use CUDA Graph.
*/
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph,
false,
"Use CUDA Graph in new executor");
DEFINE_int32(record_pool_max_size, DEFINE_int32(record_pool_max_size,
2000000, 2000000,
"SlotRecordDataset slot record pool max size"); "SlotRecordDataset slot record pool max size");
...@@ -1181,3 +1193,16 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, ""); ...@@ -1181,3 +1193,16 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, "");
PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache, PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache,
false, false,
"Add a persistent ibuilder."); "Add a persistent ibuilder.");
/**
* mmap_allocator related FLAG
* Name: use_shm_cache
* Since Version: 2.5.0
* Value Range: bool, default=true
* Example:
* Note: . If True, mmap_allocator will cache shm file to decrease munmap
* operation.
*/
PADDLE_DEFINE_EXPORTED_bool(use_shm_cache,
true,
"Use shm cache in mmap_allocator.");
...@@ -101,6 +101,12 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> { ...@@ -101,6 +101,12 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout, default_tensor_layout,
default_key.dtype(), default_key.dtype(),
arg_type); arg_type);
} else if (arg_type ==
std::type_index(typeid(const phi::ExtendedTensor&))) {
args_def->AppendInput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
} else if (arg_type == std::type_index(typeid( } else if (arg_type == std::type_index(typeid(
const std::vector<const ExtendedTensor*>&))) { const std::vector<const ExtendedTensor*>&))) {
args_def->AppendInput(default_key.backend(), args_def->AppendInput(default_key.backend(),
......