Commit 184fa04c authored by: W wangruting

fix_conflict

......@@ -10,6 +10,7 @@ English | [简体中文](./README_cn.md)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
[![Twitter](https://img.shields.io/badge/Twitter-1ca0f1.svg?logo=twitter&logoColor=white)](https://twitter.com/PaddlePaddle_)
Welcome to the PaddlePaddle GitHub.
......
......@@ -23,7 +23,7 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
# in case of low internet speed
#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
set(WARPCTC_INCLUDE_DIR
"${WARPCTC_INSTALL_DIR}/include"
......
......@@ -26,7 +26,7 @@ function(find_register FILENAME PATTERN OUTPUT)
PARENT_SCOPE)
endfunction()
function(find_phi_register FILENAME ADD_PATH)
function(find_phi_register FILENAME ADD_PATH PATTERN)
# set op_name to OUTPUT
set(options "")
set(oneValueArgs "")
......@@ -36,11 +36,11 @@ function(find_phi_register FILENAME ADD_PATH)
string(
REGEX
MATCH
"PD_REGISTER_KERNEL\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
"${PATTERN}\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
register
"${CONTENT}")
if(NOT register STREQUAL "")
string(REPLACE "PD_REGISTER_KERNEL(" "" register "${register}")
string(REPLACE "${PATTERN}(" "" register "${register}")
string(REPLACE "," ";" register "${register}")
string(REGEX REPLACE "[ \\\t\r\n]+" "" register "${register}")
string(REGEX REPLACE "//cuda_only" "" register "${register}")
......@@ -401,7 +401,8 @@ function(op_library TARGET)
# pybind USE_OP_ITSELF
set(op_name "")
# Add PHI Kernel Registry Message
find_phi_register(${cc_src} ${pybind_file})
find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_KERNEL")
find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
find_register(${cc_src} "REGISTER_OPERATOR" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
......@@ -440,7 +441,8 @@ function(op_library TARGET)
foreach(cu_src ${cu_srcs})
set(op_name "")
# Add PHI Kernel Registry Message
find_phi_register(${cu_src} ${pybind_file})
find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_KERNEL")
find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL")
find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
......
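For context, here is a minimal standalone C++ sketch (not part of this commit) of the kind of matching `find_phi_register` performs now that it takes a `PATTERN` argument: it scans a source file for `PD_REGISTER_KERNEL` / `PD_REGISTER_GENERAL_KERNEL` registrations and extracts the kernel name. The regular expression below is a simplification of the one in operators.cmake, and the registration string is just an example.

```cpp
// Illustrative sketch only: a simplified std::regex version of the
// registration-scanning pattern used by find_phi_register above.
#include <iostream>
#include <regex>
#include <string>

int main() {
  const std::string content =
      "PD_REGISTER_GENERAL_KERNEL(feed_dense_tensor, CPU, ALL_LAYOUT, ...)";
  // Group 1: the macro name; group 2: the first argument (the kernel name).
  const std::regex re(R"((PD_REGISTER(?:_GENERAL)?_KERNEL)\(\s*([a-z0-9_]+)\s*,)");
  std::smatch m;
  if (std::regex_search(content, m, re)) {
    std::cout << "macro:  " << m[1] << "\n";   // PD_REGISTER_GENERAL_KERNEL
    std::cout << "kernel: " << m[2] << "\n";   // feed_dense_tensor
  }
  return 0;
}
```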
......@@ -446,7 +446,8 @@ endif()
if(WITH_DISTRIBUTE
AND NOT WITH_PSLIB
AND NOT WITH_PSCORE)
AND NOT WITH_PSCORE
AND NOT WITH_RPC)
include(external/snappy)
list(APPEND third_party_deps extern_snappy)
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/distributed/collective/bkcl_tools.h"
#include "paddle/fluid/distributed/collective/common.h"
#include "paddle/fluid/distributed/collective/utils.h"
#include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/phi/core/device_context.h"
......@@ -87,6 +88,73 @@ void ProcessGroupBKCL::GroupEnd() {
PADDLE_ENFORCE_XPU_SUCCESS(bkcl_group_end());
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
phi::DenseTensor* tensor,
int src_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) {
// numel > 0 indicates that the tensor needs to be sliced
phi::DenseTensor partial_tensor;
if (numel > 0) {
partial_tensor = GetPartialTensor(*tensor, offset, numel);
tensor = &partial_tensor;
}
return Collective(
tensor,
// have to pass a tensor here
// TODO(zhangxiaoci) catch up with nccl's api
*tensor,
[&](phi::DenseTensor* output,
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
return bkcl_recv(comm,
output->data(),
output->numel(),
src_rank,
platform::ToBKCLDataType(
framework::TransToProtoVarType(output->type())),
stream);
},
CommType::RECV,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
const phi::DenseTensor& tensor,
int dst_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) {
// numel > 0 indicates that the tensor needs to be sliced
const phi::DenseTensor& tensor_maybe_partial =
numel > 0 ? GetPartialTensor(tensor, offset, numel) : tensor;
return Collective(
nullptr,
tensor_maybe_partial,
[&](phi::DenseTensor* output,
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
return bkcl_send(comm,
input.data(),
input.numel(),
dst_rank,
platform::ToBKCLDataType(
framework::TransToProtoVarType(input.type())),
stream);
},
CommType::SEND,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroupBKCL::BKCLTask> ProcessGroupBKCL::CreateTask(
const Place& place,
int rank,
......@@ -136,6 +204,8 @@ void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place,
BKCLContext_t bkcl_comm;
BKCLCHECK(bkcl_init_rank(&bkcl_comm, GetRank(), GetSize(), &bkcl_id));
comm_ctx->SetBkclContext(bkcl_comm);
// comm context creates a separate XPU stream for communication
comm_ctx->CreateStream();
place_to_calc_ctx_[place_key] = calc_ctx;
place_to_comm_ctx_[place_key] = std::move(comm_ctx);
......
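As a side note, the new Send/Recv paths share an offset/numel convention: `numel > 0` means "operate on the slice `[offset, offset + numel)`", otherwise the whole tensor is used. The following standalone sketch (not from the commit; `std::vector` stands in for `phi::DenseTensor` and `GetPartialTensor`) illustrates that convention.

```cpp
// Standalone illustration of the "numel > 0 means sliced" convention.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

struct Span {
  const float* data;
  int64_t numel;
};

Span MaybePartial(const std::vector<float>& buf, int64_t offset, int64_t numel) {
  if (numel > 0) {  // slice [offset, offset + numel)
    assert(offset >= 0 && offset + numel <= static_cast<int64_t>(buf.size()));
    return {buf.data() + offset, numel};
  }
  return {buf.data(), static_cast<int64_t>(buf.size())};  // whole tensor
}

int main() {
  std::vector<float> tensor(8, 1.0f);
  std::cout << MaybePartial(tensor, 0, 0).numel << "\n";  // 8 (whole tensor)
  std::cout << MaybePartial(tensor, 2, 4).numel << "\n";  // 4 (slice)
  return 0;
}
```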
......@@ -87,25 +87,25 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
phi::DeviceContext* GetDeviceContext(const Place& place,
bool use_calc_stream) const override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::shared_ptr<ProcessGroup::Task> AllGather(
phi::DenseTensor* out_tensor,
const phi::DenseTensor& in_tensor,
const AllreduceOptions& opts,
int64_t offset, // for compatibility, no use now
int64_t numel, // for compatibility, no use now
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::shared_ptr<ProcessGroup::Task> AllReduce(
phi::DenseTensor* out_tensor,
const phi::DenseTensor& in_tensor,
const BroadcastOptions& opts,
const AllreduceOptions& opts,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::shared_ptr<ProcessGroup::Task> Broadcast(
phi::DenseTensor* out_tensor,
const phi::DenseTensor& in_tensor,
int64_t offset, // for compatibility, no use now
int64_t numel, // for compatibility, no use now
const BroadcastOptions& opts,
bool sync_op,
bool use_calc_stream) override;
......@@ -115,6 +115,20 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
int src_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Send(const phi::DenseTensor& tensor,
int dst_rank,
int64_t offset,
int64_t numel,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
......
......@@ -33,7 +33,7 @@ void AmplifierInterceptor::RunOps() {
// run_per_steps_, run_at_offset_
// 4, 0 --> run at step 0, 4, 8, 12
// 4, 3 --> run at step 3, 7, 11, 15
if ((step_ % run_per_steps_) == run_at_offset_) {
if ((cur_scope_id_ % run_per_steps_) == run_at_offset_) {
ComputeInterceptor::RunOps();
}
}
......@@ -41,7 +41,7 @@ void AmplifierInterceptor::RunOps() {
void AmplifierInterceptor::SendDataReadyToDownStream() {
// runs multiple times but sends data_is_ready to downstream only once;
// that is, multiple inputs produce a single output
if (step_ % send_down_per_steps_ == 0) {
if (cur_scope_id_ % send_down_per_steps_ == 0) {
ComputeInterceptor::SendDataReadyToDownStream();
}
}
......@@ -49,7 +49,7 @@ void AmplifierInterceptor::SendDataReadyToDownStream() {
void AmplifierInterceptor::ReplyCompletedToUpStream() {
// runs multiple times but replies to upstream only once; that is,
// one input, multiple outputs
if (step_ % reply_up_per_steps_ == 0) {
if (cur_scope_id_ % reply_up_per_steps_ == 0) {
ComputeInterceptor::ReplyCompletedToUpStream();
}
}
......
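The scheduling predicate above is now keyed on `cur_scope_id_` instead of a running step counter. A standalone sketch (not part of the commit) of that predicate: with `run_per_steps = 4`, offset 0 fires at scope ids 0, 4, 8, 12 and offset 3 fires at 3, 7, 11, 15, matching the comment in `AmplifierInterceptor::RunOps`.

```cpp
// Standalone restatement of the amplifier scheduling predicate.
#include <cstdint>
#include <iostream>

bool ShouldRun(int64_t cur_scope_id, int64_t run_per_steps, int64_t run_at_offset) {
  return (cur_scope_id % run_per_steps) == run_at_offset;
}

int main() {
  for (int64_t scope = 0; scope < 16; ++scope) {
    if (ShouldRun(scope, /*run_per_steps=*/4, /*run_at_offset=*/3)) {
      std::cout << "run at scope " << scope << "\n";  // prints 3, 7, 11, 15
    }
  }
  return 0;
}
```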
......@@ -21,7 +21,7 @@
namespace paddle {
namespace distributed {
class AmplifierInterceptor : public ComputeInterceptor {
class AmplifierInterceptor final : public ComputeInterceptor {
public:
AmplifierInterceptor(int64_t interceptor_id, TaskNode* node);
......
......@@ -71,6 +71,9 @@ void Carrier::Init(
microbatch_scopes_[i] = &minibatch_scope_->NewScope();
CopyParameters(i, program, inference_root_scope_vars);
}
// Add source and sink interceptor id to rank
interceptor_id_to_rank_.emplace(SOURCE_ID, rank);
interceptor_id_to_rank_.emplace(SINK_ID, rank);
// TODO(fleet_exe dev): thread pool
thread_num_ = 1;
......@@ -159,16 +162,10 @@ void Carrier::Start() {
true,
platform::errors::PreconditionNotMet(
"Using carrier before initialized."));
for (int64_t id : source_interceptor_ids_) {
VLOG(3) << "Carrier Start is sending start to source interceptor " << id
<< ".";
InterceptorMessage start_msg;
// source node data_is_ready is send by carrier, so set src_id=-1
start_msg.set_src_id(-1);
start_msg.set_dst_id(id);
start_msg.set_message_type(DATA_IS_READY);
start_msg.set_dst_id(SOURCE_ID);
start_msg.set_message_type(START);
Send(start_msg);
}
// TODO(wangxi): async step
Wait();
dev_ctx_->Wait();
......@@ -270,6 +267,38 @@ void Carrier::CreateInterceptors() {
auto gc = GetGC(place_);
// create source and sink task node
auto max_run_times = microbatch_scopes_.size();
TaskNode* source = new TaskNode(
rank_, SOURCE_ID, max_run_times); // rank, task_id, max_run_times
TaskNode* sink = new TaskNode(rank_, SINK_ID, max_run_times);
// find nodes without upstreams or without downstreams
std::vector<TaskNode*> origin_sources, origin_sinks;
for (const auto& item : interceptor_id_to_node_) {
TaskNode* task_node = item.second;
if (task_node->upstream().empty()) {
origin_sources.emplace_back(task_node);
}
if (task_node->downstream().empty()) {
origin_sinks.emplace_back(task_node);
}
}
// link source node with origin source
for (const auto& node : origin_sources) {
source->AddDownstreamTask(node->task_id(),
std::numeric_limits<int64_t>::max());
node->AddUpstreamTask(SOURCE_ID, std::numeric_limits<int64_t>::max());
}
// link sink node with origin sink
for (const auto& node : origin_sinks) {
sink->AddUpstreamTask(node->task_id(), std::numeric_limits<int64_t>::max());
node->AddDownstreamTask(SINK_ID, std::numeric_limits<int64_t>::max());
}
// create source and sink interceptor
SetInterceptor(SOURCE_ID,
InterceptorFactory::Create("Source", SOURCE_ID, source));
SetInterceptor(SINK_ID, InterceptorFactory::Create("Sink", SINK_ID, sink));
// create each Interceptor
// no auto init since there is no config
for (const auto& item : interceptor_id_to_node_) {
......@@ -303,9 +332,15 @@ void Carrier::CreateInterceptors() {
VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
<< " with type: " << task_node->type() << ".";
if (task_node->upstream().empty()) {
source_interceptor_ids_.emplace_back(interceptor_id);
}
PADDLE_ENFORCE_EQ(
task_node->upstream().empty(),
false,
platform::errors::PreconditionNotMet(
"There should not have normal nodes as source nodes"));
PADDLE_ENFORCE_EQ(task_node->downstream().empty(),
false,
platform::errors::PreconditionNotMet(
"There should not have normal nodes as sink nodes"));
}
}
......
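The wiring added in `Carrier::CreateInterceptors` can be summarized with a standalone toy graph, shown below (not from the commit): every node without an upstream is hung below a synthetic source and every node without a downstream is hung above a synthetic sink. The real code uses `TaskNode`, the `SOURCE_ID`/`SINK_ID` constants, and a buffer size of `std::numeric_limits<int64_t>::max()`; the ids here are hypothetical.

```cpp
// Toy task graph: node id -> upstream ids / downstream ids.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <vector>

struct Graph {
  std::map<int64_t, std::set<int64_t>> up, down;
  void AddNode(int64_t id) { up.try_emplace(id); down.try_emplace(id); }
  void Link(int64_t from, int64_t to) { down[from].insert(to); up[to].insert(from); }
};

int main() {
  const int64_t kSourceId = 1000001, kSinkId = 1000002;  // hypothetical ids
  Graph g;
  for (int64_t id : {0, 1, 2}) g.AddNode(id);
  g.Link(0, 1);
  g.Link(1, 2);  // task graph: 0 -> 1 -> 2

  // Find nodes without upstreams / without downstreams (origin sources/sinks).
  std::vector<int64_t> origin_sources, origin_sinks;
  for (const auto& [id, ups] : g.up) {
    if (ups.empty()) origin_sources.push_back(id);
  }
  for (const auto& [id, downs] : g.down) {
    if (downs.empty()) origin_sinks.push_back(id);
  }

  // Hang them below a synthetic source and above a synthetic sink.
  g.AddNode(kSourceId);
  g.AddNode(kSinkId);
  for (int64_t id : origin_sources) g.Link(kSourceId, id);  // source -> 0
  for (int64_t id : origin_sinks) g.Link(id, kSinkId);      // 2 -> sink

  std::cout << "linked " << origin_sources.size() << " source(s) and "
            << origin_sinks.size() << " sink(s)\n";  // 1 source, 1 sink
  return 0;
}
```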
......@@ -100,8 +100,6 @@ class Carrier final {
std::unordered_map<int64_t, std::unique_ptr<Interceptor>>
interceptor_idx_to_interceptor_;
std::vector<int64_t> source_interceptor_ids_;
bool is_init_{false};
std::mutex running_mutex_;
......
......@@ -34,29 +34,10 @@ void ComputeInterceptor::PrepareDeps() {
for (auto up : upstream) {
in_readys_.emplace(up.first, std::make_pair(up.second, 0));
in_stops_.emplace(up.first, false);
}
for (auto down : downstream) {
out_buffs_.emplace(down.first, std::make_pair(down.second, 0));
}
// source compute node, should we add a new SourceInterceptor?
if (upstream.empty()) {
is_source_ = true;
PADDLE_ENFORCE_GT(node_->max_run_times(),
0,
platform::errors::InvalidArgument(
"Source ComputeInterceptor must run at least one "
"times, but now max_run_times=%ld",
node_->max_run_times()));
in_readys_.emplace(-1,
std::make_pair(std::numeric_limits<int64_t>::max(), 0));
}
// If there is no downstream or every downstream is in different rank,
// then this interceptor is the last one for current rank.
// This can be get during init, can be cached for later use.
is_last_ = downstream.empty();
}
void ComputeInterceptor::IncreaseReady(int64_t up_id) {
......@@ -66,12 +47,6 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) {
platform::errors::NotFound(
"Cannot find upstream=%lld in in_readys.", up_id));
// source node has no upstream, data_is_ready is send by carrier or others
if (is_source_ && up_id == -1) {
it->second.second += GetTaskNode()->max_run_times();
return;
}
auto max_ready_size = it->second.first;
auto ready_size = it->second.second;
ready_size += 1;
......@@ -152,7 +127,7 @@ void ComputeInterceptor::SendDataReadyToDownStream() {
ready_msg.set_message_type(DATA_IS_READY);
VLOG(3) << "ComputeInterceptor " << interceptor_id_
<< " Send data_is_ready msg to " << down_id
<< " for step: " << step_;
<< " in scope: " << cur_scope_id_;
Send(down_id, ready_msg);
}
}
......@@ -173,8 +148,7 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
VLOG(3) << "ComputeInterceptor " << interceptor_id_
<< " Reply data_is_useless msg to " << up_id
<< " for step: " << step_;
if (is_source_ && up_id == -1) return;
<< " in scope: " << cur_scope_id_;
InterceptorMessage reply_msg;
reply_msg.set_message_type(DATA_IS_USELESS);
......@@ -183,13 +157,17 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
}
void ComputeInterceptor::RunOps() {
VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
<< step_ + 1 << " time.";
for (auto op : node_->ops()) {
op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_);
PADDLE_ENFORCE_LT(cur_scope_id_,
microbatch_scopes_.size(),
platform::errors::InvalidArgument(
"Step out of range. There are %ld "
"microbatch_scopes, but recevice scope index %ld",
microbatch_scopes_.size(),
cur_scope_id_));
op->Run(*microbatch_scopes_[cur_scope_id_], place_);
if (gc_) {
framework::DeleteUnusedTensors(
*microbatch_scopes_[step_ % node_->max_run_times()],
framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
op,
node_->unused_vars(),
gc_.get());
......@@ -201,77 +179,28 @@ void ComputeInterceptor::Run() {
while (IsInputReady() && CanWriteOutput()) {
VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running";
// get the ready scope id from queue
cur_scope_id_ = ready_queue_.front();
ready_queue_.pop();
RunOps();
++step_;
// send to downstream and increase buff used
SendDataReadyToDownStream();
// reply to upstream and decrease ready data
ReplyCompletedToUpStream();
// Try to stop Carrier
if (is_last_ && (step_ % node_->max_run_times() == 0)) {
VLOG(3) << "Interceptor " << GetInterceptorId()
<< " is stopping carrier.";
// FIXME(wangxi): with multi sink interceptor
StopCarrier();
}
}
}
void ComputeInterceptor::ReceivedStop(int64_t up_id) {
received_stop_ = true;
// source node has no upstream, stop is send by carrier or others
if (is_source_ && up_id == -1) return;
auto it = in_stops_.find(up_id);
PADDLE_ENFORCE_NE(it,
in_stops_.end(),
platform::errors::NotFound(
"Cannot find upstream=%lld in in_stops.", up_id));
PADDLE_ENFORCE_EQ(
it->second,
false,
platform::errors::AlreadyExists("Already received stop from %lld, stop "
"cannot be send more than once."));
it->second = true;
}
void ComputeInterceptor::TryStop() {
if (!received_stop_) return;
// can stop only when all upstream is stop and
// downstream complete
for (auto& in_stop : in_stops_) {
if (!in_stop.second) return;
}
for (auto& out_buff : out_buffs_) {
auto used_size = out_buff.second.second;
if (used_size != 0) return;
}
// send stop to downstream
for (auto& out : out_buffs_) {
auto down_id = out.first;
InterceptorMessage stop;
stop.set_message_type(STOP);
Send(down_id, stop);
}
stop_ = true;
}
void ComputeInterceptor::Compute(const InterceptorMessage& msg) {
if (msg.message_type() == DATA_IS_READY) {
IncreaseReady(msg.src_id());
ready_queue_.push(msg.scope_idx());
Run();
} else if (msg.message_type() == DATA_IS_USELESS) {
DecreaseBuff(msg.src_id());
Run();
} else if (msg.message_type() == STOP) {
ReceivedStop(msg.src_id());
}
TryStop();
}
REGISTER_INTERCEPTOR(Compute, ComputeInterceptor);
......
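A minimal standalone sketch (not from the commit) of the new scope-id queue: each `DATA_IS_READY` message carries a scope index that is pushed onto `ready_queue_`, and `Run()` pops one id per iteration and executes the ops bound to that scope, here replaced by a print. Message handling and the upstream/downstream bookkeeping are omitted.

```cpp
// Toy version of the ready-queue-driven compute loop.
#include <cstdint>
#include <iostream>
#include <queue>

class ToyComputeInterceptor {
 public:
  void OnDataIsReady(int64_t scope_idx) {
    ready_queue_.push(scope_idx);  // msg.scope_idx() in the real code
    Run();
  }

 private:
  void Run() {
    while (!ready_queue_.empty()) {
      cur_scope_id_ = ready_queue_.front();
      ready_queue_.pop();
      std::cout << "run ops in scope " << cur_scope_id_ << "\n";
    }
  }
  std::queue<int64_t> ready_queue_;
  int64_t cur_scope_id_ = 0;
};

int main() {
  ToyComputeInterceptor interceptor;
  interceptor.OnDataIsReady(0);
  interceptor.OnDataIsReady(1);
  interceptor.OnDataIsReady(2);
  return 0;
}
```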
......@@ -14,6 +14,7 @@
#pragma once
#include <queue>
#include <utility>
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
......@@ -30,7 +31,8 @@ class ComputeInterceptor : public Interceptor {
virtual void SendDataReadyToDownStream();
virtual void ReplyCompletedToUpStream();
int64_t step_{0};
std::queue<int64_t> ready_queue_;
int64_t cur_scope_id_;
private:
void PrepareDeps();
......@@ -43,19 +45,10 @@ class ComputeInterceptor : public Interceptor {
void Run();
void Compute(const InterceptorMessage& msg);
void ReceivedStop(int64_t up_id);
void TryStop();
bool is_source_{false};
bool is_last_{false};
// upstream_id-->(max_ready_size, ready_size)
std::map<int64_t, std::pair<int64_t, int64_t>> in_readys_{};
// downstream_id-->(max_buffer_size, used_size)
std::map<int64_t, std::pair<int64_t, int64_t>> out_buffs_{};
bool received_stop_{false};
std::map<int64_t, bool> in_stops_{};
};
} // namespace distributed
......
......@@ -93,7 +93,6 @@ class Interceptor {
TaskNode* node_;
// for stop
bool stop_{false};
void StopCarrier();
// for runtime
......@@ -114,9 +113,6 @@ class Interceptor {
std::mutex mutex_;
std::deque<InterceptorMessage> messages_;
int64_t already_run_times_{0};
int64_t used_slot_nums_{0};
};
class InterceptorFactory {
......
......@@ -25,7 +25,7 @@ namespace distributed {
* 1. record the num of micro-step
* 2. check whether to notify carrier the current step is finished
*/
class SinkInterceptor : public Interceptor {
class SinkInterceptor final : public Interceptor {
public:
SinkInterceptor(int64_t interceptor_id, TaskNode* node);
......
......@@ -25,7 +25,7 @@ namespace distributed {
* 1. receive `start` message from carrier
* 2. send num_of_steps `data_is_ready` message to downstream
*/
class SourceInterceptor : public Interceptor {
class SourceInterceptor final : public Interceptor {
public:
SourceInterceptor(int64_t interceptor_id, TaskNode* node);
......
......@@ -25,57 +25,42 @@ limitations under the License. */
namespace paddle {
namespace distributed {
class StartInterceptor : public Interceptor {
public:
StartInterceptor(int64_t interceptor_id, TaskNode* node)
: Interceptor(interceptor_id, node) {
RegisterMsgHandle([this](const InterceptorMessage& msg) { NOP(msg); });
}
void NOP(const InterceptorMessage& msg) {
if (msg.message_type() == STOP) {
stop_ = true;
InterceptorMessage stop;
stop.set_message_type(STOP);
Send(1, stop); // stop 1, compute
return;
}
std::cout << GetInterceptorId() << " recv msg from " << msg.src_id()
<< std::endl;
}
};
TEST(ComputeInterceptor, Compute) {
std::string carrier_id = "0";
Carrier* carrier =
GlobalMap<std::string, Carrier>::Create(carrier_id, carrier_id);
carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}});
carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}, {1, 0}, {SINK_ID, 0}});
MessageBus* msg_bus = GlobalVal<MessageBus>::Create();
msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "");
// NOTE: don't delete, otherwise interceptor will use undefined node
TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id
TaskNode* source =
new TaskNode(0, SOURCE_ID, 3); // rank, task_id, max_run_times
TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);
TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0);
TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0);
TaskNode* sink = new TaskNode(0, SINK_ID, 3);
// a->b->c
// source->a->b->sink
source->AddDownstreamTask(0);
node_a->AddUpstreamTask(SOURCE_ID);
node_a->AddDownstreamTask(1, 3);
node_b->AddUpstreamTask(0, 3);
node_b->AddDownstreamTask(2);
node_c->AddUpstreamTask(1);
node_b->AddDownstreamTask(SINK_ID);
sink->AddUpstreamTask(1);
Interceptor* a =
carrier->SetInterceptor(0, std::make_unique<StartInterceptor>(0, node_a));
carrier->SetInterceptor(
SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source));
carrier->SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a));
carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b));
carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c));
carrier->SetInterceptor(SINK_ID,
InterceptorFactory::Create("Sink", SINK_ID, sink));
// start
InterceptorMessage msg;
msg.set_message_type(DATA_IS_READY);
// test run three times
a->Send(1, msg);
a->Send(1, msg);
a->Send(1, msg);
msg.set_message_type(START);
msg.set_dst_id(SOURCE_ID);
carrier->EnqueueInterceptorMessage(msg);
carrier->Wait();
carrier->Release();
......
......@@ -33,7 +33,6 @@ class PingPongInterceptor : public Interceptor {
void PingPong(const InterceptorMessage& msg) {
if (msg.message_type() == STOP) {
stop_ = true;
return;
}
std::cout << GetInterceptorId() << " recv msg, count=" << count_
......
......@@ -36,7 +36,6 @@ class PingPongInterceptor : public Interceptor {
void PingPong(const InterceptorMessage& msg) {
if (msg.message_type() == STOP) {
stop_ = true;
StopCarrier();
return;
}
......
......@@ -22,6 +22,10 @@ if(WITH_ROCM)
target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB})
endif()
if(WITH_CINN)
target_link_libraries(eager_generator ${PYTHON_LIBRARIES})
endif()
# Prepare file structure
message(
"Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated"
......
......@@ -47,7 +47,7 @@ void SetFeedVariable(Scope* scope,
}
void SetFeedVariable(Scope* scope,
const Strings& input,
const std::vector<std::string>& input,
const std::string& var_name,
size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
......@@ -59,7 +59,7 @@ void SetFeedVariable(Scope* scope,
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index] = input;
feed_inputs[index] = Strings(input);
}
FetchType& GetFetchVariable(const Scope& scope,
......
......@@ -35,7 +35,7 @@ void SetFeedVariable(Scope* scope,
size_t index);
void SetFeedVariable(Scope* scope,
const Strings& input,
const std::vector<std::string>& input,
const std::string& var_name,
size_t index);
......
......@@ -19,12 +19,14 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/phi/core/extended_tensor.h"
namespace paddle {
namespace framework {
using FeedType =
paddle::variant<phi::DenseTensor, Strings, phi::SparseCooTensor>;
using FeedList = std::vector<FeedType>;
using FeedList = paddle::framework::PhiVector<FeedType>;
using FetchType = paddle::variant<phi::DenseTensor,
LoDTensorArray,
......
......@@ -117,6 +117,15 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
return var_type == proto::VarType::SPARSE_COO;
}
bool IsSparseCooTensorOutput(const std::string& name) const override {
auto var_types = ctx_.GetOutputsVarType(name);
return std::all_of(var_types.begin(),
var_types.end(),
[](const proto::VarType::Type& type) {
return type == proto::VarType::SPARSE_COO;
});
}
bool IsSparseCsrTensorInput(const std::string& name) const override {
auto var_type = ctx_.GetInputVarType(name);
return var_type == proto::VarType::SPARSE_CSR;
......
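The new `IsSparseCooTensorOutput` override follows a simple all-of rule: the output counts as sparse COO only when every variable bound to that name has the `SPARSE_COO` type. A standalone illustration (with a toy `VarType` enum in place of `proto::VarType::Type`):

```cpp
// Standalone illustration of the std::all_of check used above.
#include <algorithm>
#include <iostream>
#include <vector>

enum class VarType { LOD_TENSOR, SPARSE_COO, SPARSE_CSR };

bool IsSparseCooTensorOutput(const std::vector<VarType>& var_types) {
  return std::all_of(var_types.begin(), var_types.end(),
                     [](VarType t) { return t == VarType::SPARSE_COO; });
}

int main() {
  std::cout << std::boolalpha
            << IsSparseCooTensorOutput({VarType::SPARSE_COO}) << "\n"  // true
            << IsSparseCooTensorOutput(
                   {VarType::SPARSE_COO, VarType::LOD_TENSOR})
            << "\n";  // false
  return 0;
}
```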
......@@ -126,6 +126,45 @@ void InferShapeUtilsTestKernel(const Context& dev_ctx,
VLOG(6) << "Come into InferShapeUtilsTestKernel";
}
void TestOutputInferMeta(const phi::MetaTensor& x, phi::MetaTensor* out) {
ASSERT_EQ(x.dtype(), phi::DataType::FLOAT32);
}
class InferShapeUtilsTestOutputOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddOutput("Out", "output of test op");
AddComment("This is test op");
}
};
class InferShapeUtilsTestOutputOp : public OperatorWithKernel {
public:
using OperatorWithKernel::OperatorWithKernel;
phi::KernelKey GetExpectedKernelType(
const ExecutionContext& ctx) const override {
return phi::KernelKey(proto::VarType::FP32, ctx.GetPlace());
}
};
phi::KernelSignature TestSparseOutputOpArgumentMapping(
const phi::ArgumentMappingContext& ctx) {
if (ctx.IsSparseCooTensorOutput("Out")) {
return phi::KernelSignature(
"test_sparse_coo_tensor_output", {"X"}, {}, {"Out"});
}
return phi::KernelSignature("test_output", {"X"}, {}, {"Out"});
}
template <typename T, typename Context>
void InferShapeUtilsTestOutputKernel(const Context& dev_ctx,
const phi::DenseTensor& x,
phi::SparseCooTensor* out) {
VLOG(6) << "Come into InferShapeUtilsTestOutputKernel";
}
} // namespace framework
} // namespace paddle
......@@ -143,6 +182,21 @@ PD_REGISTER_KERNEL(infer_shape_utils_test,
paddle::framework::InferShapeUtilsTestKernel,
int) {}
DECLARE_INFER_SHAPE_FUNCTOR(
infer_shape_utils_test_output,
InferShapeUtilsTestOutputInferShapeFunctor,
PD_INFER_META(paddle::framework::TestOutputInferMeta));
REGISTER_OPERATOR(infer_shape_utils_test_output,
paddle::framework::InferShapeUtilsTestOutputOp,
paddle::framework::InferShapeUtilsTestOutputOpMaker,
InferShapeUtilsTestOutputInferShapeFunctor);
PD_REGISTER_KERNEL(test_sparse_coo_tensor_output,
CPU,
ALL_LAYOUT,
paddle::framework::InferShapeUtilsTestOutputKernel,
int) {}
TEST(InferShapeUtilsTest, ALL) {
paddle::framework::ProgramDesc prog;
paddle::framework::proto::BlockDesc proto_block;
......@@ -200,3 +254,27 @@ TEST(InferShapeUtilsTest, ALL) {
op->InferShape(block_desc);
}
TEST(InferShapeUtilsTestOutput, ALL) {
paddle::framework::ProgramDesc prog;
paddle::framework::proto::BlockDesc proto_block;
paddle::framework::BlockDesc block_desc(&prog, &proto_block);
auto* op = block_desc.AppendOp();
op->SetType("infer_shape_utils_test_output");
auto* x = block_desc.Var("x");
x->SetType(paddle::framework::proto::VarType::LOD_TENSOR);
x->SetDataType(paddle::framework::proto::VarType::FP32);
op->SetInput("X", {"x"});
auto* out = block_desc.Var("out");
out->SetType(paddle::framework::proto::VarType::SPARSE_COO);
op->SetOutput("Out", {"out"});
phi::OpUtilsMap::Instance().InsertArgumentMappingFn(
"infer_shape_utils_test_output",
paddle::framework::TestSparseOutputOpArgumentMapping);
op->InferShape(block_desc);
}
......@@ -460,14 +460,6 @@ if(WITH_MKLDNN)
test_cpu_quantize_squash_pass
SRCS mkldnn/cpu_quantize_squash_pass_tester.cc
DEPS cpu_quantize_squash_pass naive_executor)
cc_test(
test_reshape_transpose_matmul_mkldnn_fuse_pass
SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
DEPS reshape_transpose_matmul_mkldnn_fuse_pass)
cc_test(
test_matmul_transpose_reshape_fuse_pass
SRCS mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc
DEPS matmul_transpose_reshape_mkldnn_fuse_pass)
cc_test(
test_shuffle_channel_mkldnn_detect_pass
SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc *prog,
const std::string &type,
const std::vector<std::string> &inputs,
const std::vector<std::string> &outputs) {
auto *op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetInput("X", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
if (type == "transpose2") {
op->SetAttr("axis", std::vector<int>({0, 2, 1, 3}));
op->SetOutput("XShape", {outputs[1]});
}
if (type == "reshape2") {
op->SetAttr("shape", std::vector<int>({4, 5, 6}));
op->SetOutput("XShape", {outputs[1]});
}
if (type == "matmul") {
op->SetInput("Y", {inputs[1]});
op->SetAttr("use_mkldnn", true);
op->SetAttr("alpha", 1.0f);
op->SetAttr("transpose_X", true);
op->SetAttr("transpose_Y", true);
}
if (type == "matmul_v2") {
op->SetInput("Y", {inputs[1]});
op->SetAttr("use_mkldnn", true);
op->SetAttr("trans_x", true);
op->SetAttr("trans_y", true);
}
}
ProgramDesc BuildProgramDesc(const std::string &op_name) {
ProgramDesc prog;
for (auto &v : std::initializer_list<std::string>(
{"a1", "a2", "b", "c", "cx", "d", "dx", "e"})) {
auto *var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
}
SetOp(&prog, op_name, {"a1", "a2"}, {"b"});
SetOp(&prog, "transpose2", {"b"}, {"c", "cx"});
SetOp(&prog, "reshape2", {"c"}, {"d", "dx"});
SetOp(&prog, "fc", {"d"}, {"e"});
return prog;
}
void MainTest(const ProgramDesc &prog, const std::string &op_name) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int original_nodes_num = graph->Nodes().size();
auto pass =
PassRegistry::Instance().Get("matmul_transpose_reshape_mkldnn_fuse_pass");
graph.reset(pass->Apply(graph.release()));
int current_nodes_num = graph->Nodes().size();
EXPECT_EQ(original_nodes_num - 6, current_nodes_num);
for (auto *node : graph->Nodes()) {
if (node->IsOp()) {
auto *op = node->Op();
if (op->Type() == op_name) {
EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_reshape_Out"),
std::vector<int>({4, 5, 6}));
EXPECT_EQ(op->GetAttrIfExists<std::vector<int>>("fused_transpose_Out"),
std::vector<int>({0, 2, 1, 3}));
}
}
}
}
TEST(MatmulTransposeReshapeFusePass, matmul_fuse_pass) {
auto prog = BuildProgramDesc("matmul");
MainTest(prog, "matmul");
}
TEST(MatmulTransposeReshapeFusePass, matmul_v2_fuse_pass) {
auto prog = BuildProgramDesc("matmul_v2");
MainTest(prog, "matmul_v2");
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(matmul_transpose_reshape_mkldnn_fuse_pass);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void AddVarToScope(Scope* param_scope,
const std::string& name,
const DDim& dims) {
auto* tensor = param_scope->Var(name)->GetMutable<phi::DenseTensor>();
tensor->Resize(dims);
tensor->mutable_data<float>(phi::CPUPlace());
}
Scope* CreateParamScope() {
auto param_scope = new Scope();
AddVarToScope(param_scope, "w1", {768, 768});
AddVarToScope(param_scope, "bias1", {768});
AddVarToScope(param_scope, "w2", {768, 768});
AddVarToScope(param_scope, "bias2", {768});
return param_scope;
}
void TestMain(const std::string& op_name, bool with_xshapes) {
// inputs operator output
// -----------------------------------------------
// a1,w1,bias1 fc -> b1
// b1 reshape -> c1
// c1 transpose -> d1
// a2,w2,bias2 fc -> b2
// b2 reshape -> c2
// c2 transpose -> d2
// (d1, d2) matmul(_v2) -> (...)
Layers layers;
auto* a1 = layers.data("a1", {-1, 128, 768});
auto* w1 = layers.data("w1", {768, 768}, true);
auto* bias1 = layers.data("bias1", {768}, true);
auto* b1 = layers.fc(a1, w1, bias1, 2);
b1->SetShape({-1, 128, 768});
auto* c1 = layers.reshape2(b1, {0, 0, 12, 64}, with_xshapes);
c1->SetShape({-1, 128, 12, 64});
auto* d1 = layers.transpose2(c1, {0, 2, 1, 3}, with_xshapes);
d1->SetShape({-1, 12, 128, 64});
auto* a2 = layers.data("a2", {-1, 128, 768});
auto* w2 = layers.data("w2", {768, 768}, true);
auto* bias2 = layers.data("bias2", {768}, true);
auto* b2 = layers.fc(a2, w2, bias2, 2);
b2->SetShape({-1, 128, 768});
auto* c2 = layers.reshape2(b2, {0, 0, 12, 64});
c2->SetShape({-1, 128, 12, 64});
auto* d2 = layers.transpose2(c2, {0, 2, 1, 3});
d2->SetShape({-1, 12, 128, 64});
if (op_name == "matmul_v2") {
layers.matmul_v2(d1, d2);
} else {
layers.matmul(d1, d2);
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
graph->Set("__param_scope__", CreateParamScope());
int num_reshape_nodes_before = GetNumOpNodes(graph, "reshape2");
int num_transpose_nodes_before = GetNumOpNodes(graph, "transpose2");
int total_nodes_before = graph->Nodes().size();
VLOG(3) << DebugString(graph);
auto pass =
PassRegistry::Instance().Get("reshape_transpose_matmul_mkldnn_fuse_pass");
graph.reset(pass->Apply(graph.release()));
int num_reshape_nodes_after = GetNumOpNodes(graph, "reshape2");
int num_transpose_nodes_after = GetNumOpNodes(graph, "transpose2");
int total_nodes_after = graph->Nodes().size();
VLOG(3) << DebugString(graph);
EXPECT_EQ(num_reshape_nodes_before, 2);
EXPECT_EQ(num_reshape_nodes_after, 0);
EXPECT_EQ(num_transpose_nodes_before, 2);
EXPECT_EQ(num_transpose_nodes_after, 0);
int removed = 8; // 2* reshape, reshape_out, transpose, transpose_out
if (with_xshapes) removed += 2; // transpose_xshape, reshape_xshape
EXPECT_EQ(total_nodes_before - removed, total_nodes_after);
auto* matmul_op_desc = GetOpNodes(graph, op_name).at(0)->Op();
auto check = [&matmul_op_desc](std::string a) {
std::string shape_str = "fused_reshape_" + a;
auto shape = matmul_op_desc->GetAttrIfExists<std::vector<int>>(shape_str);
EXPECT_EQ(shape, (std::vector<int>{0, 0, 12, 64}));
std::string axis_str = "fused_transpose_" + a;
auto axis = matmul_op_desc->GetAttrIfExists<std::vector<int>>(axis_str);
EXPECT_EQ(axis, (std::vector<int>{0, 2, 1, 3}));
};
check("X");
check("Y");
}
TEST(ReshapeTransposeMatmulMkldnnFusePass,
both_matmul_inputs_reshape_transpose) {
TestMain("matmul", false);
}
TEST(ReshapeTransposeMatmulMkldnnFusePass,
both_matmul_inputs_reshape_transpose_one_with_xshapes) {
TestMain("matmul", true);
}
TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
both_matmulv2_inputs_reshape_transpose) {
TestMain("matmul_v2", false);
}
TEST(ReshapeTransposeMatmulV2MkldnnFusePass,
both_matmulv2_inputs_reshape_transpose_one_with_xshapes) {
TestMain("matmul_v2", true);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(reshape_transpose_matmul_mkldnn_fuse_pass);
......@@ -19,14 +19,27 @@
#include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h"
DECLARE_bool(fast_eager_deletion_mode);
DECLARE_bool(new_executor_use_cuda_graph);
namespace paddle {
namespace framework {
bool IsInterpretercoreFastGCEnabled() {
return memory::allocation::AllocatorFacade::Instance()
// When using CUDA graph, fast GC must be used, because the
// `EventQuery` method in the event-based GC cannot be used
// during CUDA graph capture.
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == false &&
FLAGS_new_executor_use_cuda_graph,
false,
platform::errors::InvalidArgument(
"When FLAGS_new_executor_use_cuda_graph is true, "
"IsStreamSafeCUDAAllocatorUsed must be true, but "
"got false."));
return (memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() &&
FLAGS_fast_eager_deletion_mode;
FLAGS_fast_eager_deletion_mode) ||
FLAGS_new_executor_use_cuda_graph;
}
InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() {
......
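Restated as a plain standalone predicate (the parameter names below are descriptive stand-ins for the flags used above), the fast-GC decision after this change is:

```cpp
// Descriptive stand-ins for the flags referenced above:
//   stream_safe_alloc  ~ IsStreamSafeCUDAAllocatorUsed()
//   fast_eager_delete  ~ FLAGS_fast_eager_deletion_mode
//   use_cuda_graph     ~ FLAGS_new_executor_use_cuda_graph
#include <iostream>

bool IsFastGCEnabled(bool stream_safe_alloc, bool fast_eager_delete,
                     bool use_cuda_graph) {
  // The enforce above already guarantees that use_cuda_graph implies
  // stream_safe_alloc.
  return (stream_safe_alloc && fast_eager_delete) || use_cuda_graph;
}

int main() {
  std::cout << std::boolalpha
            << IsFastGCEnabled(true, true, false) << "\n"    // true
            << IsFastGCEnabled(true, false, true) << "\n"    // true (cuda graph)
            << IsFastGCEnabled(true, false, false) << "\n";  // false
  return 0;
}
```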
......@@ -31,6 +31,7 @@
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/device_manager.h"
PADDLE_DEFINE_EXPORTED_bool(
......@@ -50,6 +51,10 @@ PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor,
DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);
DECLARE_bool(new_executor_use_cuda_graph);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_bool(sync_nccl_allreduce);
#endif
constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";
......@@ -142,6 +147,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
}
return lhs_prority > rhs_prority;
};
PrepareForCUDAGraphCapture();
}
InterpreterCore::~InterpreterCore() {
......@@ -161,6 +168,7 @@ interpreter::CostInfo InterpreterCore::DryRun(
const std::vector<std::string>& feed_names,
const std::vector<phi::DenseTensor>& feed_tensors) {
SetDeviceId(place_);
CheckCUDAGraphBeforeRun(feed_names);
Prepare(feed_names, feed_tensors, true);
interpreter::CostInfo cost_info;
......@@ -221,6 +229,7 @@ paddle::framework::FetchList InterpreterCore::Run(
const std::vector<std::string>& feed_names,
const std::vector<phi::DenseTensor>& feed_tensors) {
SetDeviceId(place_);
CheckCUDAGraphBeforeRun(feed_names);
#ifdef PADDLE_WITH_MKLDNN
platform::AttachPointerHashToMKLDNNKey(this, place_);
......@@ -240,7 +249,16 @@ paddle::framework::FetchList InterpreterCore::Run(
// return Fetch Tensors
auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
if (fetch_var) {
return std::move(*fetch_var->GetMutable<framework::FetchList>());
auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_list.empty(),
true,
platform::errors::InvalidArgument(
"Cannot fetch data when using CUDA Graph."));
}
#endif
return fetch_list;
} else {
return {};
}
......@@ -249,6 +267,7 @@ paddle::framework::FetchList InterpreterCore::Run(
paddle::framework::FetchList InterpreterCore::Run(
const std::vector<std::string>& feed_names, bool need_fetch) {
SetDeviceId(place_);
CheckCUDAGraphBeforeRun(feed_names);
#ifdef PADDLE_WITH_MKLDNN
platform::AttachPointerHashToMKLDNNKey(this, place_);
......@@ -290,7 +309,16 @@ paddle::framework::FetchList InterpreterCore::Run(
HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
if (fetch_var && need_fetch) {
return std::move(*fetch_var->GetMutable<framework::FetchList>());
auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_list.empty(),
true,
platform::errors::InvalidArgument(
"Cannot fetch data when using CUDA Graph."));
}
#endif
return fetch_list;
} else {
return {};
}
......@@ -504,6 +532,67 @@ void InterpreterCore::BuildInplace() {
}
}
void InterpreterCore::PrepareForCUDAGraphCapture() {
if (!FLAGS_new_executor_use_cuda_graph) return;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(),
false,
platform::errors::PermissionDenied("CUDA Graph is not allowed to capture "
"when running the first batch."));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(place_),
true,
platform::errors::InvalidArgument(
"CUDA Graph is only supported on NVIDIA GPU device."));
// If set to true, `cudaStreamSynchronize(nccl_stream)` will be called after allreduce,
// which may cause errors during CUDA graph capture. This behavior is consistent with PE.
PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce,
false,
platform::errors::InvalidArgument(
"FLAGS_sync_nccl_allreduce must be False to support "
"CUDA Graph capturing."));
// No output var of the coalesce_tensor op should be garbage collected.
// If the fused output var of coalesce_tensor is garbage collected, it
// causes accuracy problems; the exact reason still needs to be analyzed.
for (auto& op_desc : block_.AllOps()) {
if (op_desc->Type() == kCoalesceTensor) {
for (auto& out_var_name : op_desc->OutputArgumentNames()) {
execution_config_.skip_gc_vars.insert(out_var_name);
VLOG(4) << "Insert Var(" << out_var_name << ") into skip_gc_vars.";
}
}
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDA Graph is only supported on NVIDIA GPU device."));
#endif
}
void InterpreterCore::CheckCUDAGraphBeforeRun(
const std::vector<std::string>& feed_names) {
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
feed_names.empty(),
true,
platform::errors::InvalidArgument(
"Feeding data is not permitted when capturing CUDA Graph."));
PADDLE_ENFORCE_EQ(
FLAGS_new_executor_use_cuda_graph,
true,
platform::errors::InvalidArgument(
"You must turn on FLAGS_new_executor_use_cuda_graph to True "
"to enable CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
place_,
platform::CUDAGraphCapturingPlace(),
platform::errors::InvalidArgument("The place to capture CUDAGraph is "
"not the same as the place to run."));
}
#endif
}
void InterpreterCore::BuildOperatorDependences() {
// analyze the dependencies between ops, add next_instr_list to each instr,
// and set the dependecy_count_
......
......@@ -97,6 +97,10 @@ class InterpreterCore {
const std::vector<std::vector<size_t>>& input_var2op, size_t var_index);
void SetFeedVarsInplaceSkip(const std::vector<std::string>& feed_names);
// cuda graph
void CheckCUDAGraphBeforeRun(const std::vector<std::string>& feed_names);
void PrepareForCUDAGraphCapture();
// execution
void RunImpl();
void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
......
......@@ -1561,6 +1561,63 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
this->Info().infer_shape_(&infer_shape_ctx);
}
template <typename T>
bool HasSameTensorType(phi::TensorBase* phi_tensor, Variable* var) {
if (phi_tensor == nullptr && var == nullptr) {
return true;
} else if (phi_tensor != nullptr && var != nullptr) {
if (T::classof(phi_tensor) && var->IsType<T>()) {
return true;
}
}
return false;
}
// TODO(YuanRisheng): We need to collect all `need_prepare_phi_data_` handling
// into this function.
void OperatorWithKernel::CheckWhetherPreparePhiData(
const VariableNameMap& innames,
const VariableNameMap& outnames,
const Scope& scope) const {
if (run_phi_kernel_ && impl_ != nullptr) {
const auto& phi_kernel_context = impl_->getKernelContext();
size_t phi_tensor_index = 0;
// Check each tensor in the KernelContext; if any tensor has a type that
// differs from its variable, the PhiKernelContext needs to be reconstructed.
// We use kernel_signature_'s outputs to retrieve the tensors, because the
// tensors in phi_kernel_context are stored in the order of those outputs.
if (phi_kernel_context->OutputsSize() >= phi_tensor_index ||
kernel_signature_ == nullptr) {
need_prepare_phi_data_ = true;
return;
}
const auto& phi_output_names = kernel_signature_->output_names;
for (auto& phi_output_name : phi_output_names) {
const auto& iter = outnames.find(phi_output_name);
if (iter != outnames.end()) {
for (auto& var_name : iter->second) {
auto var_output = scope.FindVar(var_name);
auto phi_output =
phi_kernel_context->MutableOutputAt<phi::TensorBase>(
phi_tensor_index);
if (phi_output == nullptr) {
continue;
}
if (!(HasSameTensorType<phi::DenseTensor>(phi_output, var_output) ||
HasSameTensorType<phi::SparseCooTensor>(phi_output,
var_output) ||
HasSameTensorType<framework::Strings>(phi_output,
var_output))) {
need_prepare_phi_data_ = true;
}
phi_tensor_index++;
}
}
}
}
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
// To reduce the elapsed time of HasAttr, we use bool variable to record the
......@@ -1571,6 +1628,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
HasAttr(kAllKernelsMustComputeRuntimeShape))
all_kernels_must_compute_runtime_shape_ = true;
const Scope* cur_scope = &scope;
CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
if (!enable_cache_runtime_context_) {
RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx);
......@@ -2993,7 +3051,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
"to the size of kernel attribute_defs (%d).",
attr_names.size(),
attr_defs.size()));
for (size_t i = 0; i < input_names.size(); ++i) {
auto it = ctx.inputs.find(input_names[i]);
......@@ -3037,6 +3094,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
} else if (var->IsType<framework::Vocab>()) {
tensor_in = &(var->Get<framework::Vocab>());
phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} else if (var->IsType<framework::FeedList>()) {
tensor_in = &(var->Get<framework::FeedList>());
phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported input `%s` type when call pt kernel.",
......@@ -3047,7 +3107,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
phi_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
VLOG(4) << "Done inputs";
for (size_t i = 0; i < output_names.size(); ++i) {
auto it = ctx.outputs.find(output_names[i]);
size_t start_idx =
......@@ -3087,6 +3146,9 @@ void OperatorWithKernel::BuildPhiKernelContext(
// Note: If the input LoDTensorArray size is 0, the output
// LoDTensorArray is also 0
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<framework::Strings>()) {
tensor_out = var->template GetMutable<framework::Strings>();
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<paddle::framework::RawTensor>()) {
tensor_out = var->template GetMutable<paddle::framework::RawTensor>();
phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
......@@ -3108,7 +3170,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
i);
}
VLOG(4) << "Done outputs";
for (size_t i = 0; i < attr_names.size(); ++i) {
VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": "
<< attr_defs[i].type_index;
......
......@@ -550,6 +550,13 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
return var->IsType<phi::SparseCooTensor>();
}
bool IsSparseCooTensorOutput(const std::string& name) const override {
auto vars = ctx_.MultiOutputVar(name);
return std::all_of(vars.begin(), vars.end(), [](const Variable* var) {
return var->IsType<phi::SparseCooTensor>();
});
}
bool IsSparseCsrTensorInput(const std::string& name) const override {
const auto* var = ctx_.InputVar(name);
return var->IsType<phi::SparseCsrTensor>();
......@@ -746,6 +753,10 @@ class OperatorWithKernel : public OperatorBase {
RuntimeContext* ctx,
const phi::Place& place) const;
void CheckWhetherPreparePhiData(const VariableNameMap& innames,
const VariableNameMap& outnames,
const Scope& scope) const;
void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const;
......
......@@ -60,6 +60,7 @@ if(WITH_TESTING)
elementwise_add_op
generated_op)
set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN")
target_link_libraries(build_cinn_pass_test ${PYTHON_LIBRARIES})
cc_test_old(transform_desc_test SRCS transform_desc_test.cc DEPS
transform_desc)
......
......@@ -23,8 +23,8 @@ namespace paddle {
namespace framework {
/// \brief Fluid Kernel and PHI Kernel will be unified in the future.
/// So, we need a class in PHI that can represent the RAW type in Fluid.
/// The RawTensor is for PHI Kernel that has RAW type arguments.
/// So, we need a class in PHI that can represent the RawTensor type in Fluid.
/// The RawTensor is for PHI Kernel that has RawTensor type arguments.
class RawTensor : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, RawTensor> {
public:
......@@ -37,13 +37,35 @@ class RawTensor : public phi::ExtendedTensor,
RawTensor& operator=(RawTensor&& other) = default;
/// \brief Destroy the RawTensor and release exclusive resources.
virtual ~RawTensor() = default;
virtual ~RawTensor() {
if (!data_.empty()) {
data_deleter_();
}
}
public:
/// \brief Returns the name of the class for type traits.
/// \return The name of the class.
static const char* name() { return "RawTensor"; }
template <typename T>
T& Get() const {
PADDLE_ENFORCE_EQ(data_.empty(),
false,
platform::errors::PreconditionNotMet(
"The data in RawTensor is empty. Please set data "
"before using it."));
try {
return *(paddle::any_cast<T*>(data_));
} catch (paddle::bad_any_cast&) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Invalid data type error, expected %s, actual %s.",
typeid(T).name(),
data_type_.name()));
}
}
template <typename T>
T* GetMutable() {
if (!data_.empty()) {
......@@ -70,7 +92,7 @@ class RawTensor : public phi::ExtendedTensor,
private:
paddle::any data_;
std::function<void(void)> data_deleter_;
std::function<void(void)> data_deleter_ = []() {};
std::type_index data_type_ = std::type_index(typeid(void));
};
......
......@@ -25,6 +25,10 @@ limitations under the License. */
namespace paddle {
namespace framework {
// Note(YuanRisheng): Vocab is mainly used for faster_tokenizer_op, and we don't
// recommend using it widely, because faster_tokenizer_op may be deleted in the
// future and this class will be deleted along with it.
class Vocab : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, Vocab> {
public:
......@@ -94,8 +98,73 @@ class Vocab : public phi::ExtendedTensor,
std::unordered_map<std::wstring, std::int32_t> data_;
};
// Note(YuanRisheng): PhiVector is essentially a vector that is only used for
// PHI kernels. It can be used when you define a non-tensor type that needs to
// be stored in a vector as a PHI kernel argument.
template <typename T>
class PhiVector : public phi::ExtendedTensor,
public phi::TypeInfoTraits<phi::TensorBase, PhiVector<T>> {
public:
PhiVector() = default;
explicit PhiVector(const std::vector<T>& init_data) : data_(init_data) {}
PhiVector(PhiVector&& other) = default;
PhiVector(const PhiVector& other) = default;
PhiVector& operator=(const PhiVector& other) = default;
PhiVector& operator=(const std::vector<T>& other) {
data_ = other;
return *this;
}
PhiVector& operator=(PhiVector&& other) = default;
/// \brief Destroy the PhiVector and release exclusive resources.
virtual ~PhiVector() = default;
public:
/// \brief Returns the name of the class for type traits.
/// \return The name of the class.
static const char* name() {
// Use a static string so the returned pointer does not dangle
// (c_str() of a temporary would be invalid after the statement).
static const std::string name_str =
std::string("PhiVector_") + typeid(T).name();
return name_str.c_str();
}
size_t size() const { return data_.size(); }
void resize(size_t size) { data_.resize(size); }
void clear() { data_.clear(); }
void emplace_back(const T& feed_data) { data_.emplace_back(feed_data); }
const T& operator[](size_t index) const { return data_[index]; }
T& operator[](size_t index) { return data_[index]; }
T& at(size_t index) { return data_.at(index); }
const T& at(size_t index) const { return data_.at(index); }
typename std::vector<T>::iterator begin() { return data_.begin(); }
typename std::vector<T>::const_iterator begin() const {
return data_.begin();
}
typename std::vector<T>::iterator end() { return data_.end(); }
typename std::vector<T>::const_iterator end() const { return data_.end(); }
private:
std::vector<T> data_;
};
using String = std::string;
using Strings = std::vector<std::string>;
using Strings = PhiVector<std::string>;
// Convert the std::string type to the std::wstring type.
bool ConvertStrToWstr(const std::string& src, std::wstring* res);
......
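A minimal standalone stand-in for `PhiVector` (dropping the `phi::ExtendedTensor` base and the `TypeInfoTraits` machinery, so it is not the real class) just to show the vector-like surface that `Strings = PhiVector<std::string>` exposes after this change:

```cpp
// Toy PhiVector: same vector-like interface, no PHI tensor base classes.
#include <iostream>
#include <string>
#include <vector>

template <typename T>
class ToyPhiVector {
 public:
  ToyPhiVector() = default;
  explicit ToyPhiVector(const std::vector<T>& init_data) : data_(init_data) {}
  size_t size() const { return data_.size(); }
  void emplace_back(const T& v) { data_.emplace_back(v); }
  T& operator[](size_t i) { return data_[i]; }
  const T& operator[](size_t i) const { return data_[i]; }
  auto begin() { return data_.begin(); }
  auto end() { return data_.end(); }

 private:
  std::vector<T> data_;
};

using Strings = ToyPhiVector<std::string>;

int main() {
  Strings strs;
  strs.emplace_back("hello");
  strs.emplace_back("paddle");
  for (const auto& s : strs) std::cout << s << "\n";
  return 0;
}
```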
......@@ -221,6 +221,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
Vocab,
std::vector<int>,
std::vector<float>,
std::vector<std::string>,
RawTensor>;
template <typename T>
struct VarTypeTrait {
......
......@@ -1655,7 +1655,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
auto custom_place = place_;
auto paddleplace = static_cast<PaddlePlace>(
static_cast<size_t>(PaddlePlace::kCUSTOM) +
phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
res->SetPlace(paddleplace, custom_place.GetDeviceId());
} else {
auto gpu_place = place_;
......@@ -1710,7 +1711,8 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
auto custom_place = place_;
auto paddleplace = static_cast<PaddlePlace>(
static_cast<size_t>(PaddlePlace::kCUSTOM) +
phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType()));
res->SetPlace(paddleplace, custom_place.GetDeviceId());
} else {
auto gpu_place = place_;
......
......@@ -25,13 +25,16 @@ if(WITH_ONNXRUNTIME)
cc_library(
zero_copy_tensor_dummy
SRCS zero_copy_tensor_dummy.cc
DEPS onnxruntime)
DEPS onnxruntime phi_enforce)
else()
cc_library(
zero_copy_tensor
SRCS zero_copy_tensor.cc
DEPS scope lod_tensor enforce)
cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
cc_library(
zero_copy_tensor_dummy
SRCS zero_copy_tensor_dummy.cc
DEPS phi_enforce)
endif()
cc_test(
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
......@@ -76,7 +77,8 @@ void Tensor::ReshapeStrings(const size_t &shape) {
var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>();
paddle::framework::Strings *tensor =
var->GetMutable<paddle::framework::Strings>();
tensor->resize(shape);
}
......@@ -261,7 +263,9 @@ void Tensor::CopyFromCpu(const T *data) {
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::CustomPlace custom_place(
phi::GetGlobalDeviceType(device_type_id), device_);
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id),
device_);
auto *t_data = tensor->mutable_data<T>(custom_place);
auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
pool.Get(custom_place));
......@@ -354,7 +358,7 @@ void Tensor::ShareExternalData(const T *data,
}
void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
EAGER_GET_TENSOR(paddle_infer::Strings);
EAGER_GET_TENSOR(paddle::framework::Strings);
PADDLE_ENFORCE_GE(tensor->size(),
0,
paddle::platform::errors::PreconditionNotMet(
......
......@@ -112,6 +112,12 @@ bool PluginArgumentMappingContext::IsSparseCooTensorInput(
const std::string& name) const {
return false;
}
bool PluginArgumentMappingContext::IsSparseCooTensorOutput(
const std::string& name) const {
return false;
}
bool PluginArgumentMappingContext::IsSparseCsrTensorInput(
const std::string& name) const {
return false;
......
......@@ -56,6 +56,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext {
bool IsDenseTensorOutput(const std::string& name) const override;
bool IsSparseCooTensorOutput(const std::string& name) const override;
bool IsSelectedRowsOutput(const std::string& name) const override;
bool IsForInferShape() const override { return false; }
......
......@@ -124,6 +124,7 @@ TEST(ArgMappingContexTest, BasicFunction) {
EXPECT_EQ(context.IsDenseTensorOutput("Out"), false);
EXPECT_EQ(context.IsSelectedRowsOutput("Out"), false);
EXPECT_EQ(context.IsSparseCooTensorOutput("Out"), false);
EXPECT_EQ(context.IsForInferShape(), false);
}
......
......@@ -26,6 +26,8 @@
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
DECLARE_bool(use_shm_cache);
namespace paddle {
namespace memory {
namespace allocation {
......@@ -111,20 +113,33 @@ void AllocateMemoryMap(
std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename,
int flags,
size_t size) {
size_t size,
int buffer_id) {
int fd = -1;
void *base_ptr = nullptr;
if (buffer_id == -1) {
AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd);
VLOG(4) << "Create and mmap a new shm: " << filename;
} else {
base_ptr = MemoryMapAllocationPool::Instance().GetById(buffer_id).mmap_ptr_;
VLOG(4) << "Get a cached shm " << filename;
}
void *aliged_base_ptr =
static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment);
return std::make_shared<RefcountedMemoryMapAllocation>(
aliged_base_ptr, size, filename, flags, fd);
aliged_base_ptr, size, filename, flags, fd, buffer_id);
}
RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation(
void *ptr, size_t size, std::string ipc_name, int fd, int flags)
void *ptr,
size_t size,
std::string ipc_name,
int fd,
int flags,
int buffer_id)
: MemoryMapAllocation(ptr, size, ipc_name, fd, flags) {
// must reset base ptr first.
buffer_id_ = buffer_id;
resetBaseptr();
initializeRefercount();
}
......@@ -165,25 +180,40 @@ void RefcountedMemoryMapAllocation::initializeRefercount() {
}
void RefcountedMemoryMapAllocation::close() {
VLOG(4) << "Close a RefcountedMemoryMapAllocation: " << ipc_name_;
if (closed_) {
return;
}
closed_ = true;
void *data = map_ptr_;
CountInfo *info = reinterpret_cast<CountInfo *>(data);
if (--info->refcount == 0) {
--info->refcount;
if (FLAGS_use_shm_cache && buffer_id_ != -1) {
return;
} else {
if (FLAGS_use_shm_cache &&
MemoryMapAllocationPool::Instance().BufferSize() <
static_cast<size_t>(
MemoryMapAllocationPool::Instance().MaxPoolSize())) {
MemoryMapAllocationPool::Instance().Insert(MemoryMapInfo(
flags_, map_size_ - mmap_alignment, ipc_name_, map_ptr_));
} else {
if (info->refcount == 0 &&
shm_open(ipc_name_.c_str(), O_RDWR, (mode_t)0600) != -1) {
shm_unlink(ipc_name_.c_str());
VLOG(6) << "shm_unlink file: " << ipc_name_;
}
PADDLE_ENFORCE_NE(
munmap(map_ptr_, map_size_),
PADDLE_ENFORCE_NE(munmap(map_ptr_, map_size_),
-1,
platform::errors::Unavailable("could not unmap the shared memory file: ",
platform::errors::Unavailable(
"could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
}
}
}
MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
......@@ -299,6 +329,67 @@ void MemoryMapFdSet::Clear() {
MemoryMapFdSet::~MemoryMapFdSet() { Clear(); }
MemoryMapAllocationPool *MemoryMapAllocationPool::pool_ = nullptr;
void MemoryMapAllocationPool::Insert(const MemoryMapInfo &memory_map) {
std::lock_guard<std::mutex> guard(mtx_);
memory_map_allocations_.push_back(memory_map);
VLOG(4) << this << "Intsert a new shm: " << memory_map.file_name_;
}
int MemoryMapAllocationPool::FindFromCache(const int &flag,
const size_t &data_size,
const std::string &file_name,
bool check_refcount) {
std::lock_guard<std::mutex> guard(mtx_);
for (size_t idx = 0; idx < memory_map_allocations_.size(); idx++) {
if (memory_map_allocations_.at(idx).flags_ == flag &&
memory_map_allocations_.at(idx).data_size_ == data_size) {
if (file_name == "" ||
memory_map_allocations_.at(idx).file_name_ == file_name) {
if (!check_refcount || reinterpret_cast<CountInfo *>(
memory_map_allocations_.at(idx).mmap_ptr_)
->refcount == 0) {
VLOG(4) << "Match at: " << idx;
return idx;
}
}
}
}
return -1;
}
const MemoryMapInfo &MemoryMapAllocationPool::GetById(int id) {
std::lock_guard<std::mutex> guard(mtx_);
return memory_map_allocations_.at(id);
}
void MemoryMapAllocationPool::SetMaxPoolSize(const int &size) {
max_pool_size_ = size;
VLOG(4) << this << "Set max pool size is: " << max_pool_size_;
}
void MemoryMapAllocationPool::Clear() {
std::lock_guard<std::mutex> guard(mtx_);
for (auto mmap : memory_map_allocations_) {
int rlt = shm_unlink(mmap.file_name_.c_str());
if (rlt == 0) {
VLOG(4) << "MemoryMapAllocationPool: clear " << mmap.file_name_;
}
PADDLE_ENFORCE_NE(munmap(mmap.mmap_ptr_, mmap.data_size_ + mmap_alignment),
-1,
platform::errors::Unavailable(
"could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
}
memory_map_allocations_.clear();
}
MemoryMapAllocationPool::~MemoryMapAllocationPool() { Clear(); }
} // namespace allocation
} // namespace memory
} // namespace paddle
......
......@@ -75,8 +75,12 @@ class MemoryMapAllocation : public Allocation {
class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
public:
RefcountedMemoryMapAllocation(
void *ptr, size_t size, std::string ipc_name, int flags, int fd);
RefcountedMemoryMapAllocation(void *ptr,
size_t size,
std::string ipc_name,
int flags,
int fd,
int buffer_id = -1);
void incref();
int decref();
......@@ -84,6 +88,7 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
virtual ~RefcountedMemoryMapAllocation() { close(); }
protected:
int buffer_id_ = -1;
void initializeRefercount();
void resetBaseptr();
};
......@@ -94,7 +99,8 @@ void AllocateMemoryMap(
std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename,
int flags,
size_t size);
size_t size,
int buffer_id = -1);
class MemoryMapWriterAllocation : public Allocation {
public:
......@@ -153,6 +159,68 @@ class MemoryMapFdSet {
std::mutex mtx_;
};
class MemoryMapInfo {
public:
explicit MemoryMapInfo(int flags,
size_t data_size,
std::string file_name,
void *mmap_ptr)
: flags_(flags),
data_size_(data_size),
file_name_(file_name),
mmap_ptr_(mmap_ptr) {}
int flags_ = 0;
size_t data_size_ = 0;
std::string file_name_;
void *mmap_ptr_ = nullptr;
};
/* Note(zhangbo):
MemoryMapAllocationPool is used to cache and reuse shm, thus reducing the
number of munmap calls in the dataloader. The munmap(shm_mmap_ptr) call in
RefcountedMemoryMapAllocation::close() may block other threads of the
process. Therefore, a shm cache-and-reuse scheme is designed: the shm
created by the _share_filename process is cached and reused according to
its data_size, which eliminates the problem of munmap blocking other
threads.
*/
class MemoryMapAllocationPool {
public:
static MemoryMapAllocationPool &Instance() {
if (pool_ == nullptr) {
pool_ = new MemoryMapAllocationPool();
}
return *pool_;
}
void Insert(const MemoryMapInfo &memory_map);
int FindFromCache(const int &flag,
const size_t &data_size,
const std::string &file_name = "",
bool check_refcount = true);
const MemoryMapInfo &GetById(int id);
size_t BufferSize() { return memory_map_allocations_.size(); }
void Clear();
void SetMaxPoolSize(const int &size);
int MaxPoolSize() { return max_pool_size_; }
~MemoryMapAllocationPool();
private:
MemoryMapAllocationPool() = default;
static MemoryMapAllocationPool *pool_;
std::vector<MemoryMapInfo> memory_map_allocations_;
int max_pool_size_ = 0;
std::mutex mtx_;
};
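// Editor's illustrative sketch (not part of this change): how a caller might
// combine the pool with AllocateRefcountedMemoryMapAllocation above. `flags`
// and `data_size` are assumed to come from the caller, and `new_shm_name` is
// a hypothetical fresh shm file name; a cached mapping is reused only when
// FindFromCache reports a match, otherwise buffer_id stays -1 and a new
// mapping is created.
//
//   int buffer_id = MemoryMapAllocationPool::Instance().FindFromCache(
//       flags, data_size);  // returns -1 when no reusable shm is cached
//   const std::string filename =
//       buffer_id == -1
//           ? new_shm_name
//           : MemoryMapAllocationPool::Instance().GetById(buffer_id).file_name_;
//   auto allocation = AllocateRefcountedMemoryMapAllocation(
//       filename, flags, data_size, buffer_id);  // mmaps only when id == -1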
} // namespace allocation
} // namespace memory
} // namespace paddle
......
......@@ -44,6 +44,7 @@ if(WITH_TESTING)
cinn_launch_context
cinn_instruction_run_op
cinn)
target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES})
set_tests_properties(cinn_launch_context_test PROPERTIES LABELS
"RUN_TYPE=CINN")
......@@ -73,6 +74,7 @@ if(WITH_TESTING)
cinn_launch_op
cinn_instruction_run_op
elementwise_add_op)
target_link_libraries(cinn_instruction_run_op_test ${PYTHON_LIBRARIES})
set_tests_properties(
cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
"${CINN_RUN_ENVIRONMENT}")
......
......@@ -11,6 +11,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/raw_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
......@@ -28,91 +30,8 @@ class OpBase;
namespace paddle {
namespace operators {
// FeedVariableVisitor is to feed the variable data
// according to data type (phi::DenseTensor or Strings).
class FeedVariableVisitor {
public:
explicit FeedVariableVisitor(framework::Variable *out_var,
const platform::Place &place)
: out_var_(out_var), place_(place) {}
void operator()(const phi::DenseTensor &in_tensor) const {
phi::DenseTensor *out_tensor = out_var_->GetMutable<phi::DenseTensor>();
if (platform::is_same_place(in_tensor.place(), place_)) {
out_tensor->ShareDataWith(in_tensor);
#ifdef PADDLE_WITH_IPU
} else if (platform::is_ipu_place(place_)) {
// For ipu, both in_tensor and out_tensor are allocated on cpu,
// PopART will copy tensor from host automatically,
// no TensorCopy() is required here.
out_tensor->ShareDataWith(in_tensor);
#endif
} else {
platform::DeviceContext *context =
platform::DeviceContextPool::Instance().Get(place_);
framework::TensorCopy(in_tensor, place_, *context, out_tensor);
}
out_tensor->set_lod(in_tensor.lod());
}
void operator()(const framework::Strings &in_str) const {
framework::Strings *out_str = out_var_->GetMutable<framework::Strings>();
out_str->resize(in_str.size());
*out_str = in_str;
}
void operator()(const phi::SparseCooTensor &in_tensor) const {
phi::SparseCooTensor *out_tensor =
out_var_->GetMutable<phi::SparseCooTensor>();
if (platform::is_same_place(in_tensor.place(), place_)) {
*out_tensor = in_tensor;
} else {
platform::DeviceContext *context =
platform::DeviceContextPool::Instance().Get(place_);
phi::DenseTensor indices, values;
framework::TensorCopy(in_tensor.indices(), place_, *context, &indices);
framework::TensorCopy(in_tensor.values(), place_, *context, &values);
out_tensor->SetMember(indices, values, in_tensor.meta());
}
}
private:
framework::Variable *out_var_;
const platform::Place &place_;
};
class FeedOp : public framework::OperatorBase {
public:
FeedOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
OP_INOUT_CHECK(HasInputs("X"), "Input", "X", "Feed");
OP_INOUT_CHECK(HasOutputs("Out"), "Output", "Out", "Feed");
auto feed_var_name = Input("X");
auto *feed_var = scope.FindVar(feed_var_name);
PADDLE_ENFORCE_NOT_NULL(
feed_var,
platform::errors::NotFound(
"Input varibale(%s) cannot be found in scope for operator 'Feed'.",
feed_var_name));
auto out_name = this->Output("Out");
auto *out_var = scope.FindVar(out_name);
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"Output variable(%s) cannot be found in scope for operator 'Feed'",
out_name));
auto col = Attr<int>("col");
const framework::FeedType& CheckAndGetFeedItem(const phi::ExtendedTensor& x,
int col) {
PADDLE_ENFORCE_GE(col,
0,
platform::errors::InvalidArgument(
......@@ -120,25 +39,119 @@ class FeedOp : public framework::OperatorBase {
"operator 'Feed') of current feeding variable to be "
"no less than 0. But received column index = %d.",
col));
VLOG(3) << "Feed variable " << feed_var_name << "'s " << col
<< " column to variable " << out_name;
auto &feed_list = feed_var->Get<framework::FeedList>();
auto feed_list = static_cast<const paddle::framework::FeedList*>(&x);
PADDLE_ENFORCE_LT(
static_cast<size_t>(col),
feed_list.size(),
feed_list->size(),
platform::errors::InvalidArgument(
"The column index of current feeding variable is expected to be "
"less than the length of feeding list. But received column index = "
"%d, the length of feeding list = %d",
col,
feed_list.size()));
feed_list->size()));
auto &feed_item = feed_list.at(static_cast<size_t>(col));
return feed_list->at(static_cast<size_t>(col));
}
FeedVariableVisitor visitor(out_var, place);
paddle::visit(visitor, feed_item);
template <typename Context>
void FeedDenseTensorKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::DenseTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
const auto& in_tensor = paddle::get<phi::DenseTensor>(feed_item);
const auto& place = dev_ctx.GetPlace();
if (platform::is_same_place(in_tensor.place(), place)) {
out->ShareDataWith(in_tensor);
} else {
framework::TensorCopy(in_tensor, place, dev_ctx, out);
}
out->set_lod(in_tensor.lod());
}
template <typename Context>
void FeedSparseCooTensorKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::SparseCooTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
const auto& in_tensor = paddle::get<phi::SparseCooTensor>(feed_item);
const auto& place = dev_ctx.GetPlace();
if (platform::is_same_place(in_tensor.place(), place)) {
*out = in_tensor;
} else {
phi::DenseTensor indices, values;
framework::TensorCopy(in_tensor.indices(), place, dev_ctx, &indices);
framework::TensorCopy(in_tensor.values(), place, dev_ctx, &values);
out->SetMember(indices, values, in_tensor.meta());
}
}
template <typename Context>
void FeedStringsKernel(const Context& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::ExtendedTensor* out) {
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"Output cannot be found in scope for operator 'Feed'"));
const auto& feed_item = CheckAndGetFeedItem(x, col);
auto strs_out = static_cast<framework::Strings*>(out);
const auto& in_str = paddle::get<framework::Strings>(feed_item);
strs_out->resize(in_str.size());
*strs_out = in_str;
}
class FeedOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "feed");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "feed");
if (ctx->IsRuntime()) {
framework::Variable* x_var =
PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]);
auto& x = x_var->Get<framework::FeedList>();
int col = ctx->Attrs().Get<int>("col");
auto& feed_item = x[col];
if (feed_item.index() == 0) {
const auto& feed_item = CheckAndGetFeedItem(x, col);
auto& feed_tensor = PADDLE_GET_CONST(phi::DenseTensor, feed_item);
ctx->SetOutputDim("Out", feed_tensor.dims());
} else if (feed_item.index() == 1) {
auto& feed_str = PADDLE_GET_CONST(framework::Strings, feed_item);
framework::Variable* out_var =
PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]);
out_var->GetMutable<framework::Strings>()->resize(feed_str.size());
} else {
auto& feed_sparse_tensor =
PADDLE_GET_CONST(phi::SparseCooTensor, feed_item);
framework::Variable* out_var =
PADDLE_GET(framework::Variable*, ctx->GetOutputVarPtrs("Out")[0]);
out_var->GetMutable<phi::SparseCooTensor>()->set_meta(
feed_sparse_tensor.meta());
out_var->GetMutable<phi::SparseCooTensor>()->SetCoalesced(
feed_sparse_tensor.coalesced());
out_var->GetMutable<phi::SparseCooTensor>()->SetIndicesDict(
feed_sparse_tensor.GetIndicesDict());
}
}
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace());
}
};
......@@ -164,9 +177,152 @@ It should not be configured by users directly.
} // namespace operators
} // namespace paddle
// TODO(YuanRisheng): Maybe we need to design a new registry macro for
// registering device-independent kernels.
REGISTER_OPERATOR(
feed,
paddle::operators::FeedOp,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
paddle::operators::FeedOpInfoMaker);
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
CPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
CPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
CPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CPUContext>,
ALL_DTYPE) {}
#if defined(PADDLE_WITH_MKLDNN)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
OneDNN,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::OneDNNContext>,
ALL_DTYPE) {}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
GPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::GPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
GPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::GPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
GPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::GPUContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_XPU)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
XPU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::XPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
XPU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::XPUContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
XPU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::XPUContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_ASCEND_CL)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
npu,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
npu,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
npu,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_MLU)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
CustomMLU,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_sparse_coo_tensor,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedSparseCooTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
custom_cpu,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#endif
......@@ -33,6 +33,27 @@ class CumOp : public framework::OperatorWithKernel {
}
};
class CumGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "cumsum");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
"Input",
"Out@GRAD",
"cumsum");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
return phi::KernelKey(input_data_type, ctx.GetPlace());
}
};
class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......@@ -69,12 +90,13 @@ class CumsumGradMaker : public framework::SingleGradOpMaker<T> {
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("cumsum");
grad_op->SetInput("X", this->OutputGrad("Out"));
grad_op->SetOutput("Out", this->InputGrad("X"));
grad_op->SetType("cumsum_grad");
grad_op->SetInput("X", this->Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
grad_op->SetAttr("reverse",
!PADDLE_GET_CONST(bool, this->GetAttr("reverse")));
PADDLE_GET_CONST(bool, this->GetAttr("reverse")));
}
};
......@@ -153,6 +175,7 @@ using CPU = phi::CPUContext;
DECLARE_INFER_SHAPE_FUNCTOR(cumsum,
CumsumInferShapeFunctor,
PD_INFER_META(phi::CumScalarAxisInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(logcumsumexp,
LogcumsumexpInferShapeFunctor,
PD_INFER_META(phi::CumInferMeta));
......@@ -169,6 +192,7 @@ REGISTER_OPERATOR(logcumsumexp,
ops::LogcumsumexpGradMaker<paddle::imperative::OpBase>,
LogcumsumexpInferShapeFunctor);
REGISTER_OPERATOR(logcumsumexp_grad, ops::LogcumsumexpGradOp);
REGISTER_OPERATOR(cumsum_grad, ops::CumGradOp);
REGISTER_OP_VERSION(cumsum).AddCheckpoint(
R"ROC(
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
class SlogDeterminantOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant");
}
};
class SlogDeterminantOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "(Tensor) The input tensor of SlogDeterminant.");
AddOutput("Out",
"(Tensor) The output tensor containing the sign of the"
"determinant and the natural logarithm"
"of the absolute value of determinant,");
AddComment(R"DOC(
SlogDeterminant Operator.)DOC");
}
};
class SlogDeterminantGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(
ctx->HasInput("Input"), "Input", "Input", "SlogDeterminantGradOp");
OP_INOUT_CHECK(
ctx->HasInput("Out"), "Input", "Out", "SlogDeterminantGradOp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
"Input",
framework::GradVarName("Out"),
"SlogDeterminantGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")),
"Output",
framework::GradVarName("Input"),
"SlogDeterminantGradOp");
ctx->SetOutputDim(framework::GradVarName("Input"),
ctx->GetInputDim("Input"));
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.GetPlace());
}
};
template <typename T>
class SlogDeterminantGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("slogdeterminant_grad");
grad_op->SetInput("Input", this->Input("Input"));
grad_op->SetInput("Out", this->Output("Out"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("Input"),
this->InputGrad("Input"));
grad_op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer,
"Input");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant,
SlogDeterminantInferShapeFunctor,
PD_INFER_META(phi::UnchangedInferMeta));
REGISTER_OPERATOR(slogdeterminant,
ops::SlogDeterminantOp,
ops::SlogDeterminantOpMaker,
ops::SlogDeterminantGradOpMaker<paddle::framework::OpDesc>,
ops::SlogDeterminantGradOpMaker<paddle::imperative::OpBase>,
SlogDeterminantInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant_grad,
SlogDeterminantGradInferShapeFunctor,
PD_INFER_META(phi::GeneralUnaryGradInferMeta));
REGISTER_OPERATOR(slogdeterminant_grad,
ops::SlogDeterminantGradOp,
SlogDeterminantGradInferShapeFunctor) // reuse det grad op
......@@ -19,6 +19,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
namespace paddle {
namespace operators {
......@@ -63,6 +66,34 @@ class ElementwiseMulOpGradMaker : public framework::SingleGradOpMaker<T> {
}
};
class ElementwiseMulGradCompositeOpMaker
: public prim::GradCompositeOpMakerBase {
using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase;
public:
void Apply() override {
auto x = this->GetSingleForwardInput("X");
auto y = this->GetSingleForwardInput("Y");
auto out_grad = this->GetSingleOutputGrad("Out");
auto x_grad = this->GetSingleInputGrad("X");
auto x_grad_p = this->GetOutputPtr(&x_grad);
auto x_grad_name = this->GetOutputName(x_grad);
auto y_grad = this->GetSingleInputGrad("Y");
auto y_grad_p = this->GetOutputPtr(&y_grad);
auto y_grad_name = this->GetOutputName(y_grad);
prim::multiply_grad<prim::DescTensor>(
x,
y,
out_grad,
static_cast<int>(this->Attr<int>("axis")),
x_grad_p,
y_grad_p);
VLOG(3) << "Runing mul_grad composite func";
this->RecoverOutputName(x_grad, x_grad_name);
this->RecoverOutputName(y_grad, y_grad_name);
}
};
template <typename T>
class ElementwiseMulDoubleGradMaker : public framework::SingleGradOpMaker<T> {
public:
......@@ -123,7 +154,8 @@ REGISTER_OPERATOR(elementwise_mul,
ops::ElementwiseMulOpMaker,
ops::ElementwiseOpInferVarType,
ops::ElementwiseMulOpGradMaker<paddle::framework::OpDesc>,
ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>);
ops::ElementwiseMulOpGradMaker<paddle::imperative::OpBase>,
ops::ElementwiseMulGradCompositeOpMaker);
REGISTER_OPERATOR(
elementwise_mul_grad,
ops::ElementwiseOpGrad,
......
......@@ -20,6 +20,9 @@ limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
......@@ -190,6 +193,24 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
}
};
class ExpandV2GradCompositeOpMaker : public prim::GradCompositeOpMakerBase {
using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase;
public:
void Apply() override {
auto x = this->GetSingleForwardInput("X");
auto out_grad = this->GetSingleOutputGrad("Out");
auto x_grad = this->GetSingleInputGrad("X");
auto x_grad_p = this->GetOutputPtr(&x_grad);
auto x_grad_name = this->GetOutputName(x_grad);
auto shape = this->Attr<std::vector<int>>("shape");
prim::expand_grad<prim::DescTensor>(
x, out_grad, paddle::experimental::IntArray(shape), x_grad_p);
VLOG(3) << "Runing expand_v2 composite func";
this->RecoverOutputName(x_grad, x_grad_name);
}
};
template <typename T>
class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
......@@ -223,6 +244,7 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(expand_v2,
ops::ExpandV2Op,
ops::ExpandV2OpMaker,
ops::ExpandV2GradCompositeOpMaker,
ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>,
ExpandInferShapeFunctor);
......
......@@ -256,6 +256,16 @@ def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict):
op_item['no_need_buffer'] = get_param_list_alias(
op_item['no_need_buffer'], args_map
)
if 'data_transform' in op_item and op_item['data_transform']:
data_trans_item = op_item['data_transform']
if 'skip_transform' in data_trans_item:
data_trans_item['skip_transform'] = get_param_list_alias(
data_trans_item['skip_transform'], args_map
)
if 'support_trans_dtype' in data_trans_item:
data_trans_item['support_trans_dtype'] = get_param_list_alias(
data_trans_item['support_trans_dtype'], args_map
)
process_scalar(op_item, scalar_configs)
process_int_array(op_item, int_array_configs)
......
......@@ -427,7 +427,41 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"):
else:
no_buffer_args = None
# TODO(chenfeiyu): data_transform
# add data_transform tag for every input.
# the format is {data_transform : {skip_transform : [x, z], support_trans_dtype : y}}
for input in inputs:
input["data_transform"] = {}
if "data_transform" in op_entry:
skip_trans_args = []
support_trans_args = []
data_trans = op_entry["data_transform"]
if "skip_transform" in data_trans:
skip_trans_args = parse_plain_list(data_trans["skip_transform"])
for name in skip_trans_args:
assert (
name in input_names
), f"{op_name} has an skip_transform input: '{name}' which is not an input."
data_trans["skip_transform"] = skip_trans_args
if "support_trans_dtype" in data_trans:
support_trans_args = parse_plain_list(
data_trans["support_trans_dtype"]
)
for name in support_trans_args:
assert (
name in input_names
), f"{op_name} has an support_trans_dtype input: '{name}' which is not an input."
data_trans["support_trans_dtype"] = support_trans_args
for input in inputs:
if input["name"] in skip_trans_args:
input["data_transform"]["skip_trans_args"] = True
else:
input["data_transform"]["skip_trans_args"] = False
if input["name"] in support_trans_args:
input["data_transform"]["support_trans_dtype"] = True
else:
input["data_transform"]["support_trans_dtype"] = False
else:
data_trans = None
op = {
"name": op_name,
......@@ -435,6 +469,7 @@ def parse_op_entry(op_entry: Dict[str, Any], name_field="op"):
"attrs": attrs,
"outputs": outputs,
"no_need_buffer": no_buffer_args,
"data_transform": data_trans,
}
# invokes another op ?
......
......@@ -117,6 +117,15 @@ static_cast<int>(phi::Place({{"phi::" if not default_value is initializer_list}}
{# --------------------------------------- name mapping ---------------------------------------------- #}
{% macro name_map(op) %}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) {
{% set kernel_args = op["kernel"]["param"] %}
{{get_input_list(op["inputs"], kernel_args)}};
......@@ -136,15 +145,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum
return sig;
{%endif%}
}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
{% endmacro %}
{% macro get_kernel_dispatch(inputs, kernel_config) %}{# inline #}
......@@ -172,6 +172,15 @@ ctx.IsSparseCsrTensorInput("{{input["name"]}}"){{" && " if not loop.last}}
{%- endmacro %}
{% macro sparse_op_name_map(op) %}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) {
{% set kernel_args = op["kernel"]["param"] %}
{{get_input_list(op["inputs"], kernel_args)}};
......@@ -188,15 +197,6 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum
KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs));
return sig;
}
/*
******************************************************************
NOTE: The following codes are for 'get_compat_kernel_signature.py'
All possible KernelSignatures returned by {{op["name"] | to_pascal_case }}OpArgumentMapping:
{{op | cartesian_prod_mapping}}
******************************************************************
*/
{% endmacro %}
{% macro register_base_kernel_name(op) %}
......@@ -284,6 +284,32 @@ phi::KernelKey GetExpectedKernelType(
}
{% endmacro %}
{% macro get_kernel_for_var(op) %} {# only for data_transform #}
{% set skip_args = op["data_transform"]["skip_transform"] %}
{% set var_name = "var_name" %}
{% set skip_args_len = skip_args | length %}
phi::KernelKey GetKernelTypeForVar(
const std::string& {{var_name}},
const phi::DenseTensor& tensor,
const phi::KernelKey& expected_kernel_type) const override {
if (
{%- for skip_arg in skip_args -%}
var_name == "{{ skip_arg }}"
{%- if skip_args_len != 1 and loop.index != skip_args_len %} || {% endif -%}
{%- endfor -%}
){
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
}
else{
return phi::KernelKey(
tensor.place(), tensor.layout(), expected_kernel_type.dtype());
}
}
{% endmacro %}
{# --------------------------------------- operator ---------------------------------------------- #}
{% macro operator(op) %}
class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKernel {
......@@ -296,6 +322,14 @@ class {{op["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKerne
{% filter indent(2, True)%}
{{get_expected_kernel(op)}}
{% endfilter %}
{%- if "data_transform" in op and op["data_transform"] is not none -%}
{%- if "skip_transform" in op["data_transform"] -%}
{% filter indent(2, True) %}
{{get_kernel_for_var(op)}}
{% endfilter %}
{%- endif %}
{%- endif -%}
{# TODO(lizhiyu): add the 'support_trans_dtype' #}
{% endif %}
};
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle {
namespace operators {
class GraphSendRecvOP : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context().GetPlace());
}
};
class GraphSendRecvGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context().GetPlace());
}
};
class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor with data type float32, float64, int32, int64.");
AddInput("Src_index", "The source index tensor.");
AddInput("Dst_index", "The destination index tensor.");
AddInput("Out_size",
"(Tensor<int>, optional). The 0th dimension of the output."
"It has a higher priority than Attr(out_size).")
.AsDispensable();
AddOutput("Out", "Output tensor of graph_send_recv op.");
AddOutput("Dst_count",
"Count tensor of Dst_index, mainly for MEAN reduce_op.")
.AsIntermediate();
AddAttr<std::string>("reduce_op",
"(string, default 'SUM')"
"Define different pool types to receive the result "
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<std::vector<int64_t>>(
"out_size",
"(vector<int64_t>, default {0})"
"Define the first dimension of Output tensor."
"If set default {0}, then the shape of Out is the same with X.")
.SetDefault({0});
AddComment(R"DOC(
Graph Learning Send_Recv combine operator.
$Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$
This operator is mainly used in the Graph Learning domain, and its main purpose is to reduce
intermediate memory consumption in the process of message passing.
Taking `x` as the input tensor, we first use `src_index` to gather the corresponding data,
and then use `dst_index` to update the corresponding positions of the output tensor with different
pooling types, like sum, mean, max, or min.
)DOC");
}
};
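// Editor's illustrative sketch (not part of this operator's kernels): a naive
// CPU reference of the "SUM" reduce_op described in the comment above. It
// assumes 1-D float node features, int64 index tensors of equal length, and a
// zero-initialized output that is already sized; the real kernels also cover
// higher-rank features and the MEAN/MIN/MAX reduce types.
//
//   void NaiveSendRecvSum(const std::vector<float>& x,
//                         const std::vector<int64_t>& src_index,
//                         const std::vector<int64_t>& dst_index,
//                         std::vector<float>* out) {
//     for (size_t i = 0; i < src_index.size(); ++i) {
//       // gather from the source node, then scatter-add into the destination
//       (*out)[dst_index[i]] += x[src_index[i]];
//     }
//   }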
template <typename T>
class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("graph_send_recv_grad");
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));
op->SetInput("X", this->Input("X"));
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
}
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" ||
PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") {
op->SetInput("Out", this->Output("Out"));
}
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv,
GraphSendRecvInferShapeFunctor,
PD_INFER_META(phi::SendURecvInferMeta));
REGISTER_OPERATOR(graph_send_recv,
ops::GraphSendRecvOP,
ops::GraphSendRecvOpMaker,
ops::GraphSendRecvGradOpMaker<paddle::framework::OpDesc>,
ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>,
GraphSendRecvInferShapeFunctor);
REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
class GraphSendUERecvOP : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context().GetPlace());
}
};
class GraphSendUERecvGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
auto y_dims = ctx->GetInputDim("Y");
ctx->SetOutputDim(framework::GradVarName("Y"), y_dims);
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context().GetPlace());
}
};
class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor with data type float32, float64, int32, int64.");
AddInput("Y",
"The input edge weight tensor, data type should be same with X");
AddInput("Src_index", "The source index tensor.");
AddInput("Dst_index", "The destination index tensor.");
AddInput("Out_size",
"(Tensor<int>, optional). The 0th dimension of the output."
"It has a higher priority than Attr(out_size).")
.AsDispensable();
AddOutput("Out", "Output tensor of graph_send_ue_recv op.");
AddOutput("Dst_count",
"Count tensor of Dst_index, mainly for MEAN reduce_op.")
.AsIntermediate();
AddAttr<std::string>("message_op",
"(string, default 'ADD')"
"Define differenct computation types between X and E.")
.SetDefault("ADD")
.InEnum({"ADD", "MUL"});
AddAttr<std::string>("reduce_op",
"(string, default 'SUM')"
"Define different pool types to receive the result "
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<std::vector<int64_t>>(
"out_size",
"(vector<int64_t>, default {0})"
"Define the first dimension of Output tensor."
"If set default {0}, then the shape of Out is the same with X.")
.SetDefault({0});
AddComment(R"DOC(
Graph Learning Send_UE_Recv combine operator.
$Out = Recv(Compute(Send(X, Src_index), Y, message_op), Dst_index, reduce_op)$
This operator is mainly used in the Graph Learning domain, and its main purpose is to reduce
intermediate memory consumption in the process of message passing.
Taking `X` as the input tensor, we first use `src_index` to gather the corresponding data.
The gathered data is then combined with `Y` through different message_ops, like add, sub, mul, and div,
and the result is scattered with `dst_index` into the corresponding positions of the output
tensor using different pooling types, like sum, mean, max, or min.
)DOC");
}
};
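// Editor's illustrative sketch (not part of this operator's kernels): compared
// with graph_send_recv, the gathered node feature is first combined with the
// per-edge weight Y through message_op before the reduce. Assuming 1-D float
// data, int64 indices, message_op == "MUL" and reduce_op == "SUM" with a
// zero-initialized output:
//
//   void NaiveSendUERecvMulSum(const std::vector<float>& x,
//                              const std::vector<float>& y,
//                              const std::vector<int64_t>& src_index,
//                              const std::vector<int64_t>& dst_index,
//                              std::vector<float>* out) {
//     for (size_t i = 0; i < src_index.size(); ++i) {
//       const float message = x[src_index[i]] * y[i];  // message_op: MUL
//       (*out)[dst_index[i]] += message;               // reduce_op: SUM
//     }
//   }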
template <typename T>
class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("graph_send_ue_recv_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Y", this->Input("Y"));
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
}
if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" ||
PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") {
op->SetInput("Out", this->Output("Out"));
}
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(graph_send_ue_recv,
GraphSendUERecvInferShapeFunctor,
PD_INFER_META(phi::SendUERecvInferMeta));
REGISTER_OPERATOR(graph_send_ue_recv,
ops::GraphSendUERecvOP,
ops::GraphSendUERecvOpMaker,
ops::GraphSendUERecvGradOpMaker<paddle::framework::OpDesc>,
ops::GraphSendUERecvGradOpMaker<paddle::imperative::OpBase>,
GraphSendUERecvInferShapeFunctor);
REGISTER_OPERATOR(graph_send_ue_recv_grad, ops::GraphSendUERecvGradOp);
......@@ -84,7 +84,7 @@ class ReduceSumCompositeGradOpMaker : public prim::GradCompositeOpMakerBase {
// get output original name
std::string x_grad_name = this->GetOutputName(x_grad_t);
VLOG(3) << "Running sum_grad composite func";
// call composite backward func
prim::sum_grad<prim::DescTensor>(
x, out_grad, axis, keep_dim, reduce_all, x_grad);
......
......@@ -95,7 +95,8 @@ class SelectOutputInferShape : public framework::InferShapeBase {
void operator()(framework::InferShapeContext *context) const override {
OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SelectOutput");
OP_INOUT_CHECK(context->HasInput("Mask"), "Input", "Mask", "SelectOutput");
OP_INOUT_CHECK(context->HasOutputs("Out"), "Output", "Out", "SelectOutput");
OP_INOUT_CHECK(
context->HasOutputs("Out", true), "Output", "Out", "SelectOutput");
}
};
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
class SizeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto dtype = framework::proto::VarType::FP32; // dtype is not important
return phi::KernelKey(dtype, ctx.GetPlace());
}
phi::KernelKey GetKernelTypeForVar(
const std::string& var_name,
const phi::DenseTensor& tensor,
const phi::KernelKey& expected_kernel_type) const override {
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
}
};
class SizeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "The input tensor.");
AddOutput("Out",
"The returned tensor, the data type "
"is int64_t, will be on the same device with the input Tensor.");
AddComment(R"DOC(
Size Operator.
Return the number of elements in the input.
)DOC");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(SizeOpNoNeedBufferVarInferer, "Input");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(size,
SizeInferShapeFunctor,
PD_INFER_META(phi::NumelInferMeta));
REGISTER_OPERATOR(
size,
ops::SizeOp,
ops::SizeOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
SizeInferShapeFunctor,
ops::SizeOpNoNeedBufferVarInferer);
......@@ -407,8 +407,8 @@ int BertTokenizer::Encode(
void BertTokenizer::BatchEncode(
vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
const vector<string>& batch_text,
const vector<string>& batch_text_pair /* = vector<string>() */,
const framework::Strings& batch_text,
const framework::Strings& batch_text_pair /* = vector<string>() */,
bool is_split_into_words /* = false */,
const size_t max_seq_len /* = 0 */,
bool pad_to_max_seq_len /* = false */) const {
......
......@@ -100,8 +100,8 @@ class BertTokenizer {
bool pad_to_max_seq_len = false) const;
void BatchEncode(
vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
const vector<string>& batch_text,
const vector<string>& batch_text_pair = vector<string>(),
const framework::Strings& batch_text,
const framework::Strings& batch_text_pair = framework::Strings(),
bool is_split_into_words = false,
const size_t max_seq_len = 0,
bool pad_to_max_seq_len = false) const;
......@@ -162,7 +162,7 @@ class FasterTokenizerKernel : public framework::OpKernel<T> {
} else {
tokenizer.BatchEncode(&batch_encode_inputs,
*text,
vector<string>(),
framework::Strings(),
is_split_into_words,
max_seq_len,
pad_to_max_seq_len);
......
......@@ -18,6 +18,7 @@
#include "paddle/phi/backends/all_context.h"
DECLARE_bool(use_stream_safe_cuda_allocator);
DECLARE_bool(new_executor_use_cuda_graph);
namespace paddle {
namespace platform {
......@@ -43,7 +44,10 @@ void BeginCUDAGraphCapture(phi::GPUPlace place,
auto stream = dev_ctx->stream();
CUDAGraph::BeginCapture(place, stream, mode);
auto old_value = FLAGS_use_stream_safe_cuda_allocator;
// When using CUDA graph in the new executor, fast GC must be used, so
// FLAGS_use_stream_safe_cuda_allocator must stay true and is not disabled here.
auto old_value = FLAGS_use_stream_safe_cuda_allocator &&
!FLAGS_new_executor_use_cuda_graph;
if (old_value) {
FLAGS_use_stream_safe_cuda_allocator = false;
}
......
......@@ -20,6 +20,7 @@ namespace platform {
void CudaProfilerInit(const std::string& output_file,
const std::string& output_mode,
const std::string& config_file) {
#if CUDA_VERSION < 11000
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
platform::errors::InvalidArgument(
"Unsupported cuda profiler output mode, expect `kvp` or "
......@@ -28,6 +29,7 @@ void CudaProfilerInit(const std::string& output_file,
cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
#endif
}
void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); }
......
......@@ -183,6 +183,7 @@ class XPUDeviceContext : public phi::XPUContext {
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
xpuStream stream() const { return XPUContext::x_context()->xpu_stream; }
void CreateStream() { XPUContext::CreateStream(); }
};
template <>
......
generated/prim_api/eager_prim_api.cc
generated/prim_api/tmp_eager_prim_api.cc
generated/prim_api/*.h
add_subdirectory(auto_code_generated)
add_subdirectory(manual)
add_subdirectory(generated)
if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library(
prim_api
......
......@@ -13,6 +13,6 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/utils/utils.h"
set(api_yaml_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/ops.parsed.yaml"
)
set(legacy_api_yaml_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/legacy_ops.parsed.yaml"
)
set(tmp_eager_prim_api_cc_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_eager_prim_api.cc"
)
set(tmp_prim_api_h_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_prim_api.h"
)
set(eager_prim_api_cc_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc"
)
set(prim_api_h_path
"${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/prim_api.h")
set(prim_api_gen_file
${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated/prim_gen.py)
message("prim api Code gen")
execute_process(
WORKING_DIRECTORY
${CMAKE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated
COMMAND
${PYTHON_EXECUTABLE} ${prim_api_gen_file} --api_yaml_path
${legacy_api_yaml_path} ${api_yaml_path} --prim_api_header_path
${tmp_prim_api_h_path} --eager_prim_api_source_path
${tmp_eager_prim_api_cc_path}
RESULT_VARIABLE _result)
if(${_result})
message(FATAL_ERROR "prim api genrate failed, exiting.")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
${tmp_prim_api_h_path} ${prim_api_h_path})
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
${tmp_eager_prim_api_cc_path} ${eager_prim_api_cc_path})
message("copy tmp_xxx_prim_api to xxx_prim_api")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# prim api list
white_ops_list = [
"pow",
"scale",
"multiply",
"unsqueeze",
"expand",
"full",
"reshape",
"divide",
"sum",
"exp",
]
inplace_out_type_map = {
"Tensor": "Tensor&",
"std::vector<Tensor>": "std::vector<Tensor>&",
}
inplace_optional_out_type_map = {
"Tensor": "paddle::optional<Tensor>&",
"std::vector<Tensor>": "paddle::optional<std::vector<Tensor>>&",
}
class BaseAPI:
def __init__(self, api_item_yaml):
# self.api = api_item_yaml['op']
self.api = api_item_yaml['name']
self.is_prim_api = False
if api_item_yaml['name'] in white_ops_list:
self.is_prim_api = True
#######################################
# inputs:
# names : [], list of input names
# input_info : {input_name : type}
# attrs:
# names : [], list of attribute names
# attr_info : { attr_name : (type, default_values)}
# outputs:
# names : [], list of output names
# types : [], list of output types
# out_size_expr : [], expression for getting size of vector<Tensor>
########################################
if self.is_prim_api:
(
self.inputs,
self.attrs,
self.outputs,
self.optional_vars,
) = self.parse_args(self.api, api_item_yaml)
self.inplace_map = api_item_yaml['inplace']
def get_api_func_name(self):
return self.api
# def is_inplace(self):
# if self.inplace_map
# return True
# return False
def get_input_tensor_args(self, inplace_flag=False):
input_args = []
inplace_type_map = {
"const Tensor&": "Tensor&",
"const paddle::optional<Tensor>&": "paddle::optional<Tensor>&",
"const std::vector<Tensor>&": "std::vector<Tensor>&",
"const paddle::optional<std::vector<Tensor>>&": "paddle::optional<std::vector<Tensor>>&",
}
for name in self.inputs['names']:
name = name.split('@')[0]
if inplace_flag and name in self.inplace_map.values():
input_args.append(
inplace_type_map[self.inputs['input_info'][name]]
+ ' '
+ name
)
else:
input_args.append(self.inputs['input_info'][name] + ' ' + name)
return input_args
def get_declare_args(self, inplace_flag=False):
declare_args = self.get_input_tensor_args(inplace_flag)
for name in self.attrs['names']:
default_value = ''
if self.attrs['attr_info'][name][1] is not None:
default_value = ' = ' + self.attrs['attr_info'][name][1]
declare_args.append(
self.attrs['attr_info'][name][0] + ' ' + name + default_value
)
return ", ".join(declare_args)
def get_declare_args_nodefault(self, inplace_flag=False):
declare_args = self.get_input_tensor_args(inplace_flag)
for name in self.attrs['names']:
declare_args.append(self.attrs['attr_info'][name][0] + ' ' + name)
return ", ".join(declare_args)
def get_return_type(self, inplace_flag=False):
out_type_list = []
for i, out_type in enumerate(self.outputs['types']):
out_name = self.outputs['names'][i].split('@')[0]
if inplace_flag and out_name in self.inplace_map:
if self.inplace_map[out_name] in self.optional_vars:
out_type_list.append(
inplace_optional_out_type_map[out_type]
)
else:
out_type_list.append(inplace_out_type_map[out_type])
else:
out_type_list.append(out_type)
if len(out_type_list) == 1:
return out_type_list[0]
else:
return "std::tuple<" + ", ".join(out_type_list) + ">"
def parse_args(self, api_name, api_item_yaml):
optional_vars = []
for input_dict in api_item_yaml['inputs']:
if input_dict['optional']:
optional_vars.append(input_dict['name'])
inputs, attrs = self.parse_input_and_attr(
api_item_yaml['inputs'], api_item_yaml['attrs']
)
output_type_list, output_names, out_size_expr = self.parse_output(
api_item_yaml['outputs']
)
return (
inputs,
attrs,
{
'names': output_names,
'types': output_type_list,
'out_size_expr': out_size_expr,
},
optional_vars,
)
def parse_input_and_attr(self, inputs_list, attrs_list):
input_types_map = {
'Tensor': 'const Tensor&',
'Tensor[]': 'const std::vector<Tensor>&',
}
attr_types_map = {
'IntArray': 'const IntArray&',
'Scalar': 'const Scalar&',
'Scalar(int)': 'const Scalar&',
'Scalar(int64_t)': 'const Scalar&',
'Scalar(float)': 'const Scalar&',
'Scalar(dobule)': 'const Scalar&',
'Scalar[]': 'const std::vector<phi::Scalar>&',
'int': 'int',
'int32_t': 'int32_t',
'int64_t': 'int64_t',
'long': 'long',
'size_t': 'size_t',
'float': 'float',
'float[]': 'const std::vector<float>&',
'double': 'double',
'bool': 'bool',
'bool[]': 'const std::vector<bool>&',
'str': 'const std::string&',
'str[]': 'const std::vector<std::string>&',
'Place': 'const Place&',
'DataLayout': 'DataLayout',
'DataType': 'DataType',
'int64_t[]': 'const std::vector<int64_t>&',
'int[]': 'const std::vector<int>&',
}
optional_types_trans = {
'Tensor': 'const paddle::optional<Tensor>&',
'Tensor[]': 'const paddle::optional<std::vector<Tensor>>&',
'int': 'paddle::optional<int>',
'int32_t': 'paddle::optional<int32_t>',
'int64_t': 'paddle::optional<int64_t>',
'float': 'paddle::optional<float>',
'double': 'paddle::optional<double>',
'bool': 'paddle::optional<bool>',
'Place': 'paddle::optional<const Place&>',
'DataLayout': 'paddle::optional<DataLayout>',
'DataType': 'paddle::optional<DataType>',
}
inputs = {'names': [], 'input_info': {}}
for input_dict in inputs_list:
inputs['names'].append(input_dict['name'])
if input_dict['optional']:
inputs['input_info'][input_dict['name']] = optional_types_trans[
input_dict['typename']
]
else:
inputs['input_info'][input_dict['name']] = input_types_map[
input_dict['typename']
]
attrs = {'names': [], 'attr_info': {}}
for attr_dict in attrs_list:
attrs['names'].append(attr_dict['name'])
if 'default_value' in attr_dict.keys():
default_value = attr_dict['default_value']
else:
default_value = None
if 'optional' in attr_dict.keys():
attrs['attr_info'][attr_dict['name']] = (
optional_types_trans[attr_dict['typename']],
default_value,
)
else:
attrs['attr_info'][attr_dict['name']] = (
attr_types_map[attr_dict['typename']],
default_value,
)
return inputs, attrs
def parse_output(self, outputs_list):
out_type_list = []
out_name_list = []
out_size_expr_list = []
for output_dict in outputs_list:
if output_dict['intermediate']:
continue
out_type_list.append(output_dict['typename'])
out_name_list.append(output_dict['name'])
if 'size' in output_dict.keys():
out_size_expr_list.append(output_dict['size'])
else:
out_size_expr_list.append(None)
return out_type_list, out_name_list, out_size_expr_list
class EagerPrimAPI(BaseAPI):
def __init__(self, api_item_yaml):
super().__init__(api_item_yaml)
def get_api__func_name(self):
api_func_name = self.api
# if self.is_inplace:
# if api_func_name[-1] != '_':
# api_func_name += '_'
# print("after api name", api_func_name)
return api_func_name
def gene_prim_api_declaration(self):
api_declaration = ""
api_func_name = self.get_api__func_name()
if api_func_name[-1] != '_':
api_declaration = f"""
template <typename T>
{self.get_return_type()} {api_func_name}({self.get_declare_args()});
"""
else:
api_declaration = (
api_declaration
+ f"""
template <typename T>
{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)});
"""
)
return api_declaration
def get_ad_func_input_args(self, inplace_flag=False):
input_args = []
for name in self.inputs['names']:
name = name.split('@')[0]
if inplace_flag and name in self.inplace_map.values():
input_args.append(name)
else:
input_args.append(name)
return input_args
def get_ad_func_args(self, inplace_flag=False):
ad_func_args = self.get_ad_func_input_args(inplace_flag)
for name in self.attrs['names']:
default_value = ''
if self.attrs['attr_info'][name][1] is not None:
default_value = ' = ' + self.attrs['attr_info'][name][1]
ad_func_args.append(name)
ad_func_args_str = ", ".join(ad_func_args)
return ad_func_args_str
def gene_ad_func_call(self):
api_func_name = self.get_api__func_name()
dygraph_ad_func_name = '::' + api_func_name + '_ad_func'
dygraph_ad_func_parameters = self.get_ad_func_args()
ad_func_call_str = f"""
VLOG(4) << "Eager Prim API {api_func_name}_ad_func call";
return {dygraph_ad_func_name}({dygraph_ad_func_parameters});
"""
# print("ad_func_call_str: ", ad_func_call_str)
return ad_func_call_str
def gene_eager_prim_api_code(self):
api_code = ""
indent = " "
api_func_name = self.get_api__func_name()
template = '<Tensor>'
        # func declaration
if api_func_name[-1] != '_':
api_code = f"""
template <>
{self.get_return_type()} {api_func_name}{template}({self.get_declare_args_nodefault()})
"""
else:
api_code = f"""
template <>
{self.get_return_type(inplace_flag=True)} {api_func_name}{template}({self.get_declare_args_nodefault(inplace_flag=True)})
"""
# func code
api_code = api_code + '{'
api_code += f"""{self.gene_ad_func_call()}"""
api_code += '}' + '\n'
return api_code
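For orientation, below is a minimal sketch of the per-op dict that the parsing helpers above consume once an ops.yaml entry has been loaded. The 'op' key and the concrete values are illustrative assumptions; only the 'inputs'/'attrs'/'outputs'/'inplace' shapes are taken from parse_args, parse_input_and_attr and parse_output.

# Hypothetical example of one loaded YAML entry; only the field layout follows
# the parsing code above, the values are made up for illustration.
sample_api_item = {
    'op': 'scale',  # assumed key carrying the op name
    'inputs': [
        {'name': 'x', 'typename': 'Tensor', 'optional': False},
    ],
    'attrs': [
        {'name': 'scale', 'typename': 'Scalar', 'default_value': '1.0'},
        {'name': 'bias', 'typename': 'float', 'default_value': '0.0'},
        {'name': 'bias_after_scale', 'typename': 'bool', 'default_value': 'true'},
    ],
    'outputs': [
        {'name': 'out', 'typename': 'Tensor', 'intermediate': False},
    ],
    'inplace': None,  # read directly by __init__ above
}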
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import yaml
from prim_base import EagerPrimAPI
def header_include():
return """
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/place.h"
#include "paddle/utils/optional.h"
"""
def eager_source_include(header_file_path):
return """
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
"""
def api_namespace():
return (
"""
namespace paddle {
namespace prim {
""",
"""
using Tensor = paddle::experimental::Tensor;
using Scalar = paddle::experimental::Scalar;
using IntArray = paddle::experimental::IntArray;
using DataType = paddle::experimental::DataType;
""",
"""
} // namespace prim
} // namespace paddle
""",
)
def generate_api(api_yaml_path, header_file_path, eager_prim_source_file_path):
apis = []
for each_api_yaml in api_yaml_path:
with open(each_api_yaml, 'r') as f:
api_list = yaml.load(f, Loader=yaml.FullLoader)
if api_list:
apis.extend(api_list)
header_file = open(header_file_path, 'w')
eager_prim_source_file = open(eager_prim_source_file_path, 'w')
namespace = api_namespace()
header_file.write("#pragma once\n")
header_file.write(header_include())
header_file.write(namespace[0])
header_file.write(namespace[1])
include_header_file = (
"#include paddle/fluid/prim/api/generated/prim_api/prim_api.h"
)
eager_prim_source_file.write(eager_source_include(include_header_file))
eager_prim_source_file.write(namespace[0])
for api in apis:
prim_api = EagerPrimAPI(api)
if prim_api.is_prim_api:
header_file.write(prim_api.gene_prim_api_declaration())
eager_prim_source_file.write(prim_api.gene_eager_prim_api_code())
header_file.write(namespace[2])
eager_prim_source_file.write(namespace[2])
header_file.close()
eager_prim_source_file.close()
def main():
parser = argparse.ArgumentParser(
description='Generate PaddlePaddle C++ API files'
)
parser.add_argument(
'--api_yaml_path',
help='path to api yaml file',
nargs='+',
default=['paddle/phi/api/yaml/ops.yaml'],
)
parser.add_argument(
'--prim_api_header_path',
help='output of generated prim_api header code file',
default='paddle/fluid/prim/api/generated/prim_api/prim_api.h',
)
parser.add_argument(
'--eager_prim_api_source_path',
help='output of generated eager_prim_api source code file',
default='paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc',
)
options = parser.parse_args()
api_yaml_path = options.api_yaml_path
prim_api_header_file_path = options.prim_api_header_path
eager_prim_api_source_file_path = options.eager_prim_api_source_path
generate_api(
api_yaml_path,
prim_api_header_file_path,
eager_prim_api_source_file_path,
)
if __name__ == '__main__':
main()
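As a usage note, this script is normally driven by the build system; the following is a hedged sketch of invoking it by hand. The script path is an assumption, while the flag names and default paths come from the argparse definitions above.

# Minimal sketch; GEN_SCRIPT is a hypothetical location of the generator above.
import subprocess
import sys

GEN_SCRIPT = "paddle/fluid/prim/api/auto_code_generated/eager_gen.py"

subprocess.run(
    [
        sys.executable,
        GEN_SCRIPT,
        "--api_yaml_path", "paddle/phi/api/yaml/ops.yaml",
        "--prim_api_header_path", "paddle/fluid/prim/api/generated/prim_api/prim_api.h",
        "--eager_prim_api_source_path", "paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc",
    ],
    check=True,  # fail loudly if the generator exits with a non-zero status
)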
......@@ -25,6 +25,7 @@
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/fluid/prim/api/manual/utils/utils.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
......@@ -37,7 +38,7 @@ namespace paddle {
namespace prim {
template <>
Tensor pow<DescTensor>(const Tensor& x, const paddle::experimental::Scalar& y) {
Tensor pow<DescTensor>(const Tensor& x, const Scalar& y) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
......@@ -55,7 +56,7 @@ Tensor pow<DescTensor>(const Tensor& x, const paddle::experimental::Scalar& y) {
template <>
Tensor scale<DescTensor>(const Tensor& x,
const paddle::experimental::Scalar& scale,
const Scalar& scale,
float bias,
bool bias_after_scale) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
......@@ -95,63 +96,63 @@ Tensor multiply<DescTensor>(const Tensor& x, const Tensor& y) {
}
template <>
Tensor expand<DescTensor>(const Tensor& x, const IntArray& shape) {
Tensor unsqueeze<DescTensor>(const Tensor& x, const IntArray& axis) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("expand_v2");
op->SetType("unsqueeze2");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(shape.GetData().begin(), shape.GetData().end());
op->SetAttr("shape", new_shape);
std::vector<int> new_shape(axis.GetData().begin(), axis.GetData().end());
op->SetAttr("axes", new_shape);
op->CheckAttrs();
op->InferVarType(block);
return out;
}
template <>
Tensor divide<DescTensor>(const Tensor& x, const Tensor& y) {
// Grad infershape
Tensor expand<DescTensor>(const Tensor& x, const IntArray& shape) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("elementwise_div");
op->SetType("expand_v2");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetInput("Y",
{std::static_pointer_cast<prim::DescTensor>(y.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(shape.GetData().begin(), shape.GetData().end());
op->SetAttr("shape", new_shape);
op->CheckAttrs();
op->InferVarType(block);
op->InferShape(*block);
return out;
}
template <>
Tensor unsqueeze<DescTensor>(const Tensor& x, const IntArray& axis) {
Tensor divide<DescTensor>(const Tensor& x, const Tensor& y) {
// Grad infershape
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("unsqueeze2");
op->SetType("elementwise_div");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetInput("Y",
{std::static_pointer_cast<prim::DescTensor>(y.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
std::vector<int> new_shape(axis.GetData().begin(), axis.GetData().end());
op->SetAttr("axes", new_shape);
op->CheckAttrs();
op->InferVarType(block);
op->InferShape(*block);
return out;
}
template <>
Tensor full<DescTensor>(paddle::experimental::IntArray shape,
paddle::experimental::Scalar value,
paddle::experimental::DataType dtype,
paddle::platform::Place place) {
Tensor full<DescTensor>(const IntArray& shape,
const Scalar& value,
DataType dtype,
const Place& place) {
// Grad infershape
Tensor out = empty<DescTensor>({}, dtype, place);
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
......@@ -159,9 +160,8 @@ Tensor full<DescTensor>(paddle::experimental::IntArray shape,
op->SetType("fill_constant");
op->SetAttr("shape", shape.GetData());
PADDLE_ENFORCE_EQ(
((dtype == paddle::experimental::DataType::FLOAT32) ||
(dtype == paddle::experimental::DataType::FLOAT64) ||
(dtype == paddle::experimental::DataType::FLOAT16)),
((dtype == DataType::FLOAT32) || (dtype == DataType::FLOAT64) ||
(dtype == DataType::FLOAT16)),
true,
phi::errors::InvalidArgument(
"We only support float32/float16 for full, but we got data type: %s",
......@@ -177,9 +177,9 @@ Tensor full<DescTensor>(paddle::experimental::IntArray shape,
}
template <>
Tensor sum<DescTensor>(Tensor x,
paddle::experimental::IntArray axis,
paddle::experimental::DataType dtype,
Tensor sum<DescTensor>(const Tensor& x,
const IntArray& axis,
DataType dtype,
bool keepdim) {
// Grad infershape
Tensor out = empty<DescTensor>({}, dtype, paddle::Place());
......@@ -199,12 +199,12 @@ Tensor sum<DescTensor>(Tensor x,
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs();
op->InferVarType(block);
// TODO(jiabin): This may have runtime shape skip infershape for now.
// TODO(jiabin, cxxly): This may have runtime shape skip infershape for now.
return out;
}
template <>
Tensor reshape<DescTensor>(Tensor x, paddle::experimental::IntArray shape) {
Tensor reshape<DescTensor>(const Tensor& x, const IntArray& shape) {
// Grad infershape
Tensor out = empty<DescTensor>({}, x.dtype(), paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
......@@ -222,7 +222,23 @@ Tensor reshape<DescTensor>(Tensor x, paddle::experimental::IntArray shape) {
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs();
op->InferVarType(block);
// TODO(jiabin): This may have runtime shape skip infershape for now.
// TODO(jiabin, cxxly): This may have runtime shape skip infershape for now.
return out;
}
template <>
Tensor exp<DescTensor>(const Tensor& x) {
Tensor out = empty<DescTensor>({}, phi::DataType::FLOAT32, paddle::Place());
framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock();
framework::OpDesc* op = block->AppendOp();
op->SetType("exp");
op->SetInput("X",
{std::static_pointer_cast<prim::DescTensor>(x.impl())->Name()});
op->SetOutput(
"Out", {std::static_pointer_cast<prim::DescTensor>(out.impl())->Name()});
op->CheckAttrs();
op->InferVarType(block);
op->InferShape(*block);
return out;
}
} // namespace prim
......
add_subdirectory(prim_api)
add_subdirectory(utils)
......@@ -49,7 +49,7 @@ void subtract_grad(const Tensor& x,
sum<T>(scale_out_grad, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else {
by_pass<T>(scale_out_grad, dy);
}
......@@ -62,7 +62,6 @@ void subtract_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else {
by_pass<T>(out_grad, dx);
}
......@@ -84,7 +83,6 @@ void add_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else {
by_pass<T>(out_grad, dy);
}
......@@ -97,7 +95,6 @@ void add_grad(const Tensor& x,
sum<T>(out_grad, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else {
by_pass<T>(out_grad, dx);
}
......@@ -139,7 +136,6 @@ void sum_grad(const Tensor& x,
x_grad_tmp = expand<T>(out_grad, x_dim);
}
set_output<T>(x_grad_tmp, x_grad);
// x_grad->set_impl(x_grad_tmp.impl());
}
template <typename T>
......@@ -163,10 +159,8 @@ void divide_grad(const Tensor& x,
sum<T>(dy_res, phi::vectorize(reduce_dim), y.dtype(), false);
auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
set_output<T>(dy_tmp, dy);
// dy->set_impl(dy_tmp.impl());
} else {
set_output<T>(dy_res, dy);
// dy->set_impl(dy_res.impl());
}
} // indicate we will compute dy
if (dx) {
......@@ -181,10 +175,8 @@ void divide_grad(const Tensor& x,
sum<T>(dx_res, phi::vectorize(reduce_dim), x.dtype(), false);
auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
set_output<T>(dx_tmp, dx);
// dx->set_impl(dx_tmp.impl());
} else {
set_output<T>(dx_res, dx);
// dx->set_impl(dx_res.impl());
}
} // indicate we will compute dx
}
......@@ -196,7 +188,6 @@ void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
auto tmp = divide<T>(div_x, out);
auto x_grad_tmp = multiply<T>(out_grad, tmp);
set_output<T>(x_grad_tmp, x_grad);
// x_grad->set_impl(x_grad_tmp.impl());
}
}
} // namespace prim
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h"
#include "paddle/phi/capi/include/wrapper_base.h"
namespace paddle {
namespace prim {
template <>
Tensor pow<Tensor>(const Tensor& x, const paddle::experimental::Scalar& y) {
return ::pow_ad_func(x, y);
}
template <>
Tensor scale<Tensor>(const Tensor& x,
const paddle::experimental::Scalar& scale,
float bias,
bool bias_after_scale) {
return ::scale_ad_func(x, scale, bias, bias_after_scale);
}
template <>
Tensor multiply<Tensor>(const Tensor& x, const Tensor& y) {
return ::multiply_ad_func(x, y);
}
template <>
Tensor expand<Tensor>(const Tensor& x, const IntArray& shape) {
return ::expand_ad_func(x, shape);
}
template <>
Tensor unsqueeze<Tensor>(const Tensor& x, const IntArray& axis) {
return ::unsqueeze_ad_func(x, axis);
}
template <>
Tensor divide<Tensor>(const Tensor& x, const Tensor& y) {
return ::divide_ad_func(x, y);
}
template <>
Tensor full<Tensor>(paddle::experimental::IntArray shape,
paddle::experimental::Scalar value,
paddle::experimental::DataType dtype,
paddle::platform::Place place) {
return ::full_ad_func(shape, value, dtype, place);
}
template <>
Tensor sum<Tensor>(Tensor x, IntArray axis, DataType dtype, bool keepdim) {
return ::sum_ad_func(x, axis, dtype, keepdim);
}
template <>
Tensor reshape<Tensor>(Tensor x, IntArray shape) {
return ::reshape_ad_func(x, shape);
}
} // namespace prim
} // namespace paddle
......@@ -12,50 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// prim api which can't be generated
#pragma once
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/utils/optional.h"
namespace paddle {
namespace prim {
using Tensor = paddle::experimental::Tensor;
using IntArray = paddle::experimental::IntArray;
using Scalar = paddle::experimental::Scalar;
template <typename T>
Tensor pow(const Tensor& x, const Scalar& y);
template <typename T>
Tensor scale(const Tensor& X,
const Scalar& scale,
float bias,
bool bias_after_scale);
template <typename T>
Tensor multiply(const Tensor& x, const Tensor& y);
template <typename T>
Tensor expand(const Tensor& x, const IntArray& shape);
template <typename T>
Tensor unsqueeze(const Tensor& x, const IntArray& axis);
template <typename T>
Tensor divide(const Tensor& x, const Tensor& y);
template <typename T>
Tensor full(IntArray shape,
Scalar value,
DataType dtype = DataType::FLOAT32,
Place place = CPUPlace());
template <typename T>
Tensor sum(Tensor x,
IntArray axis = {},
DataType dtype = DataType::UNDEFINED,
bool keepdim = false);
template <typename T>
Tensor reshape(Tensor x, IntArray shape);
} // namespace prim
namespace paddle {
namespace prim {} // namespace prim
} // namespace paddle
......@@ -16,11 +16,12 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/ddim.h"
using IntArray = paddle::experimental::IntArray;
namespace paddle {
namespace prim {
// We put some api like utils here
......@@ -42,42 +43,40 @@ void set_output(const paddle::experimental::Tensor& x_tmp,
paddle::experimental::Tensor* x);
// These method don't need to be specified
static phi::DDim get_reduce_dims(const phi::DDim& x_dims,
const phi::DDim& y_dims) {
static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims,
const phi::DDim& in_dims) {
std::vector<int64_t> result;
PADDLE_ENFORCE_GE(phi::product(x_dims),
phi::product(y_dims),
phi::errors::InvalidArgument(
"Only x_dims >= y_dims is accepted for "
"get_reduce_dims, but we got x_dims: %s, y_dims: %s",
x_dims,
y_dims));
int bat = x_dims.size() - y_dims.size();
int bat = dout_dims.size() - in_dims.size();
for (int i = 0; i < bat; ++i) {
result.push_back(i);
}
for (int i = 0; i < y_dims.size(); ++i) {
if (y_dims[i] == 1) {
for (int i = 0; i < in_dims.size(); ++i) {
if (in_dims[i] == 1) {
result.push_back(i + bat);
} else {
PADDLE_ENFORCE_EQ(
y_dims[i],
x_dims[i + bat],
in_dims[i],
dout_dims[i + bat],
platform::errors::InvalidArgument(
"ReduceDims dimension mismatch. Operands could "
"not be broadcast together with the shape of x_dims = [%s] and "
"the shape of y_dims = [%s]. Received [%d] in X is not equal to "
"not be broadcast together with the shape of dout = [%s] and "
"the shape of in_dims = [%s]. Received [%d] in X is not equal to "
"[%d] in Y at i:%d.",
x_dims,
y_dims,
x_dims[i + bat],
y_dims[i],
dout_dims,
in_dims,
dout_dims[i + bat],
in_dims[i],
i));
}
}
auto res_dims = phi::make_ddim(result);
VLOG(4) << "Reduce Dims is: " << res_dims;
return res_dims;
return phi::make_ddim(result);
}
static phi::DDim get_reduce_dims(const phi::DDim& x_dims,
const phi::DDim& y_dims) {
auto out_dims = paddle::operators::details::BroadcastTwoDims(x_dims, y_dims);
return get_reduce_dims_from_out(out_dims, x_dims);
}
} // namespace prim
} // namespace paddle
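To make the intent of get_reduce_dims_from_out concrete, here is a small Python re-derivation of the same rule (not the C++ implementation itself): the leading axes that exist only in dout, plus the size-1 axes of the input, are the ones to reduce over.

# Python sketch of the reduce-dims rule used above, for one broadcast example.
dout_dims = [2, 3, 4, 5]   # shape of the incoming gradient
in_dims = [3, 1, 5]        # shape the gradient must be reduced back to

bat = len(dout_dims) - len(in_dims)   # number of leading broadcast axes -> 1
reduce_dims = list(range(bat))        # leading axes are always reduced -> [0]
for i, d in enumerate(in_dims):
    if d == 1:                        # size-1 axes were broadcast, reduce them as well
        reduce_dims.append(i + bat)
    else:
        assert d == dout_dims[i + bat]  # mirrors the PADDLE_ENFORCE_EQ check
print(reduce_dims)                    # [0, 2]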
......@@ -477,6 +477,9 @@ class GradCompositeOpMakerBase {
void RecoverOutputName(const paddle::experimental::Tensor& output,
const std::string& origin_name) {
if (origin_name == framework::kEmptyVarName) return;
VLOG(4) << "Recover: "
<< static_cast<prim::DescTensor*>(output.impl().get())->Name()
<< " To: " << origin_name;
prim::StaticCompositeContext::Instance().GetBlock()->RenameVar(
static_cast<prim::DescTensor*>(output.impl().get())->Name(),
origin_name);
......
......@@ -26,9 +26,9 @@ static PyObject *eager_api_run_program(PyObject *self,
PyObject *kwargs) {
PyThreadState *tstate = nullptr;
try {
auto X = GetTensorListFromArgs("run_program", "X", args, 0, false);
auto X = GetTensorListFromArgs("run_program", "X", args, 0, true);
auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true);
auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false);
auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, true);
auto OutScope =
GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false);
auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true);
......
......@@ -1503,7 +1503,7 @@ static PyObject* tensor_method_set_string_list(TensorObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
using Strings = std::vector<std::string>;
using Strings = paddle::framework::Strings;
auto strings = CastPyArg2VectorOfString(PyTuple_GET_ITEM(args, 0), 0);
auto var_tensor = std::make_shared<egr::VariableCompatTensor>();
*var_tensor->GetMutable<Strings>() = strings;
......
......@@ -184,7 +184,7 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
value[i] = ddim[i];
}
}
if (!egr::IsVariableCompatTensor(self->tensor)) {
auto desired_layout =
paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout();
auto default_layout =
......@@ -200,17 +200,18 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
std::vector<int64_t> dims = value;
if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") {
// NCHW -> NHWC
VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] << " "
<< value[1] << " " << value[2] << " " << value[3] << " to "
VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0]
<< " " << value[1] << " " << value[2] << " " << value[3] << " to "
<< dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1];
value[0] = dims[0];
value[1] = dims[2];
value[2] = dims[3];
value[3] = dims[1];
} else if (change_dim && phi::DataLayoutToString(desired_layout) == "NHWC") {
} else if (change_dim &&
phi::DataLayoutToString(desired_layout) == "NHWC") {
// NHWC -> NCHW
VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] << " "
<< value[1] << " " << value[2] << " " << value[3] << " to "
VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0]
<< " " << value[1] << " " << value[2] << " " << value[3] << " to "
<< dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2]
<< " " << dims[1];
value[0] = dims[0];
......@@ -218,6 +219,7 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
value[2] = dims[1];
value[3] = dims[2];
}
}
return ToPyObject(value);
EAGER_CATCH_AND_THROW_RETURN_NULL
......
......@@ -624,6 +624,11 @@ void BindImperative(py::module *m_ptr) {
m.def("_cleanup_mmap_fds",
[]() { memory::allocation::MemoryMapFdSet::Instance().Clear(); });
m.def("_set_max_memory_map_allocation_pool_size", [](int32_t size) {
memory::allocation::MemoryMapAllocationPool::Instance().SetMaxPoolSize(
size);
});
#endif
m.def("start_imperative_gperf_profiler",
......
......@@ -970,7 +970,7 @@ All parameter, weight, gradient are variables in Paddle.
}
})
.def("set_string_list",
[](Variable &self, Strings str_list) {
[](Variable &self, std::vector<std::string> str_list) {
*self.GetMutable<Strings>() = str_list;
})
.def("set_vocab",
......@@ -1926,7 +1926,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable",
static_cast<void (*)( // NOLINT
Scope *,
const Strings &,
const std::vector<std::string> &,
const std::string &,
size_t)>(&framework::SetFeedVariable));
m.def("get_fetch_variable",
......
......@@ -182,6 +182,7 @@ limitations under the License. */
#include "pybind11/stl.h"
DECLARE_bool(use_mkldnn);
DECLARE_bool(use_shm_cache);
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
......@@ -910,9 +911,16 @@ void BindTensor(pybind11::module &m) { // NOLINT
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_EXCLUSIVE;
std::string handle = memory::allocation::GetIPCName();
int find_id = -1;
if (FLAGS_use_shm_cache) {
find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, data_size); // NOLINT
}
if (find_id != -1) {
handle = memory::allocation::MemoryMapAllocationPool::Instance().GetById(find_id).file_name_; // NOLINT
}
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
handle, flags, data_size);
handle, flags, data_size, find_id);
// copy data & reset holder
if (platform::is_cuda_pinned_place(holder->place())) {
......@@ -961,10 +969,13 @@ void BindTensor(pybind11::module &m) { // NOLINT
size_t size = t[1].cast<size_t>();
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_NOCREATE;
int find_id = -1;
if (FLAGS_use_shm_cache) {
find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, size, ipc_name, /*check_refcount*/ false); // NOLINT
}
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
ipc_name, flags, size);
ipc_name, flags, size, find_id);
// 3. Rebuild Tensor
tensor.ResetHolderWithType(
......
......@@ -22,6 +22,9 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/string_tensor_utils.h"
#include "paddle/phi/core/tensor_utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
namespace paddle {
namespace experimental {
......@@ -54,6 +57,11 @@ bool HasAllocation(const phi::TensorBase& t) {
BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (t.place().GetType() == AllocationType::CUSTOM) {
phi::DeviceManager::SetDevice(t.place());
}
#endif
phi::Backend backend_key = phi::TransToPhiBackend(t.place());
BackendSet backend_set(backend_key);
if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) &&
......
......@@ -431,6 +431,7 @@
kernel :
func : exp_grad
inplace : (out_grad -> x_grad)
composite : exp_grad(out, out_grad, x_grad)
- backward_op : expm1_grad
forward : expm1 (Tensor x) -> Tensor(out)
......@@ -1085,6 +1086,30 @@
func : selu_grad
data_type : out
- backward_op : send_u_recv_grad
forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM")
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : send_u_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_ue_recv_grad
forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : send_ue_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_uv_grad
forward : send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out)
args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD")
......@@ -1191,6 +1216,17 @@
func : sinh_grad
inplace : (out_grad -> x_grad)
- backward_op : slogdet_grad
forward : slogdet (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : slogdet_grad
data_type : out_grad
- backward_op : softplus_grad
forward : softplus (Tensor x, float beta, float threshold) -> Tensor(out)
args : (Tensor x, Tensor out_grad, float beta, float threshold)
......
......@@ -316,9 +316,14 @@
- backward_op : cumsum_grad
forward : cumsum(Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out)
args : (Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse)
args : (Tensor x, Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse)
output : Tensor(x_grad)
invoke : cumsum(out_grad, axis, flatten, exclusive, !reverse)
infer_meta :
func : UnchangedInferMeta
param: [x]
kernel :
func : cumsum_grad
data_type: x
- backward_op : deformable_conv_grad
forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out)
......@@ -475,6 +480,7 @@
func : expand_grad
no_need_buffer : x
backward : expand_double_grad
composite: expand_grad(x, out_grad, shape, x_grad_p)
- backward_op : exponential__grad
forward : exponential_ (Tensor x, float lam) -> Tensor(out)
......@@ -880,6 +886,7 @@
param : [x, y]
kernel :
func : multiply_grad
composite: multiply_grad(x, y, out_grad, axis, x_grad, y_grad)
backward : multiply_double_grad
- backward_op : multiply_triple_grad
......@@ -1201,30 +1208,6 @@
data_type : x
optional : summed_ids
- backward_op : send_u_recv_grad
forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM")
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
param : [x]
kernel :
func : send_u_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : send_ue_recv_grad
forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : send_ue_recv_grad
data_type : out_grad
optional: out, dst_count
- backward_op : sigmoid_cross_entropy_with_logits_grad
forward : sigmoid_cross_entropy_with_logits (Tensor x, Tensor label, bool normalize, int ignore_index) -> Tensor(out)
args : (Tensor x, Tensor label, Tensor out_grad, bool normalize, int ignore_index)
......@@ -1260,16 +1243,6 @@
backward : slice_double_grad
no_need_buffer : input
- backward_op : slogdet_grad
forward : slogdet (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
param : [x]
kernel :
func : slogdet_grad
- backward_op : softmax_grad
forward : softmax (Tensor x, int axis) -> Tensor(out)
args : (Tensor out, Tensor out_grad, int axis)
......
......@@ -1298,16 +1298,6 @@
kernel :
func : not_equal
- op : numel
args : (Tensor x)
output : Tensor(size)
infer_meta :
func : NumelInferMeta
kernel :
func : numel
data_transform:
skip_transform : x
- op : one_hot
args : (Tensor x, Scalar(int) num_classes)
output : Tensor(out)
......@@ -1588,28 +1578,6 @@
data_type : x
backward : segment_pool_grad
- op : send_u_recv
args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendURecvInferMeta
kernel :
func : send_u_recv
data_type : x
intermediate : dst_count
backward : send_u_recv_grad
- op : send_ue_recv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size)
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendUERecvInferMeta
kernel :
func : send_ue_recv
data_type : x
intermediate : dst_count
backward : send_ue_recv_grad
- op : sgd_
args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision)
output : Tensor(param_out), Tensor(master_param_out)
......@@ -1663,15 +1631,6 @@
func : slice
backward : slice_grad
- op : slogdet
args : (Tensor x)
output : Tensor
infer_meta :
func : UnchangedInferMeta
kernel :
func : slogdet
backward : slogdet_grad
- op : softmax
args : (Tensor x, int axis)
output : Tensor(out)
......
......@@ -993,6 +993,12 @@
outputs :
{out : Out, total_weight : Total_weight}
- op : numel(size)
inputs :
x : Input
outputs :
size : Out
- op : overlap_add
backward : overlap_add_grad
inputs :
......@@ -1215,6 +1221,28 @@
outputs :
out : Out
- op : send_u_recv(graph_send_recv)
backward : send_u_recv_grad(graph_send_recv_grad)
inputs :
{x : X, src_index : Src_index, dst_index : Dst_index}
outputs :
{out : Out, dst_count : Dst_count}
int_array :
out_size:
data_type : int64_t
tensor_name : Out_size
- op : send_ue_recv(graph_send_ue_recv)
backward : send_ue_recv_grad(graph_send_ue_recv_grad)
inputs :
{x : X, y : Y, src_index : Src_index, dst_index : Dst_index}
outputs :
{out : Out, dst_count : Dst_count}
int_array :
out_size:
data_type : int64_t
tensor_name : Out_size
- op : send_uv (graph_send_uv)
backward : send_uv_grad (graph_send_uv_grad)
......@@ -1286,6 +1314,13 @@
extra :
attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"]
- op : slogdet(slogdeterminant)
backward : slogdet_grad(slogdeterminant_grad)
inputs :
x : Input
outputs :
out : Out
- op : softmax
backward : softmax_grad
inputs :
......
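A quick way to sanity-check the name-compatibility entries added above is to load op_compat.yaml and look one up; the file path is an assumption, while the keys are exactly those visible in this hunk.

# Hedged sketch: inspect the numel(size) compat entry after the change.
import yaml

with open("paddle/phi/api/yaml/op_compat.yaml") as f:  # assumed repo-relative path
    compat = yaml.safe_load(f)

numel = next(e for e in compat if e["op"].startswith("numel"))
print(numel["inputs"])   # {'x': 'Input'}
print(numel["outputs"])  # {'size': 'Out'}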
......@@ -871,6 +871,18 @@
kernel :
func : npu_identity
- op : numel
args : (Tensor x)
output : Tensor(size)
infer_meta :
func : NumelInferMeta
kernel :
func : numel
data_type : x
data_transform:
skip_transform : x
no_need_buffer : x
- op : overlap_add
args: (Tensor x, int hop_length, int axis=-1)
output: Tensor
......@@ -1029,6 +1041,28 @@
func : selu
backward : selu_grad
- op : send_u_recv
args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendURecvInferMeta
kernel :
func : send_u_recv
data_type : x
intermediate : dst_count
backward : send_u_recv_grad
- op : send_ue_recv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op="ADD", str reduce_op="SUM", IntArray out_size={0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : SendUERecvInferMeta
kernel :
func : send_ue_recv
data_type : x
intermediate : dst_count
backward : send_ue_recv_grad
- op : send_uv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD")
output : Tensor(out)
......@@ -1083,6 +1117,15 @@
func : sinh
backward : sinh_grad
- op : slogdet
args : (Tensor x)
output : Tensor
infer_meta :
func : UnchangedInferMeta
kernel :
func : slogdet
backward : slogdet_grad
- op : softplus
args : (Tensor x, float beta = 1.0, float threshold = 20.0f)
output : Tensor
......
......@@ -360,6 +360,7 @@ XPUOpMap& get_kl2_ops() {
{"log_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"log_softmax", XPUKernelSet({phi::DataType::FLOAT32})},
{"log_softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"logical_not", XPUKernelSet({phi::DataType::BOOL})},
{"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"lookup_table_v2", XPUKernelSet({phi::DataType::FLOAT32})},
{"masked_select",
......
......@@ -61,11 +61,13 @@ struct XPUContext::Impl {
~Impl() {
if (owned_ && context_ != nullptr) {
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
xpu_wait(context_->xpu_stream);
if (context_->xpu_stream) {
// manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor
xpu_wait(context_->xpu_stream);
xpu_stream_destroy(context_->xpu_stream);
context_->xpu_stream = nullptr;
}
xpu::destroy_context(context_);
context_ = nullptr;
}
......@@ -73,11 +75,7 @@ struct XPUContext::Impl {
const Place& GetPlace() const { return place_; }
XPUStream stream() const {
auto s = context_->xpu_stream;
PD_CHECK(s != nullptr, "the xpu stream is nullptr.");
return s;
}
XPUStream stream() const { return context_->xpu_stream; }
xpu::Context* GetXContext() const {
PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
......@@ -103,13 +101,20 @@ struct XPUContext::Impl {
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream));
}
void SetXContext(xpu::Context* context) { context_ = context; }
void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
void CreateStream() {
if (context_->xpu_stream) {
VLOG(3) << "xpu stream is already created for current context";
return;
}
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&context_->xpu_stream));
}
bool owned_{false};
Place place_;
backends::xpu::XPUVersion xpu_version_;
......@@ -153,6 +158,8 @@ void XPUContext::SetBkclContext(xpu::BKCLContext_t context) {
impl_->SetBkclContext(context);
}
void XPUContext::CreateStream() { impl_->CreateStream(); }
void XPUContext::Init() { impl_->Init(); }
} // namespace phi
......@@ -46,6 +46,7 @@ class XPUContext : public DeviceContext,
// Return bkcl context.
xpu::BKCLContext_t bkcl_context() const;
void SetBkclContext(xpu::BKCLContext_t context);
void CreateStream();
// Wait for all operations completion in the stream.
void Wait() const override;
......
......@@ -134,7 +134,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
default: {
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_);
std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) {
os << device_type;
} else {
......@@ -178,7 +180,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::IPU;
} else {
return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
phi::GetOrRegisterGlobalDeviceTypeId(s));
phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(s));
}
}
......@@ -207,7 +210,9 @@ inline std::string BackendToString(const Backend& backend) {
default:
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_);
std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) {
return device_type;
} else {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <sstream>
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/phi/api/ext/exception.h"
......@@ -54,7 +53,8 @@ std::string Place::DebugString() const {
std::ostringstream os;
os << "Place(";
if (alloc_type_ == AllocationType::CUSTOM) {
os << GetGlobalDeviceType(device_type_id_);
os << phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
} else {
os << AllocationTypeStr(alloc_type_);
}
......@@ -85,25 +85,29 @@ Place GetPinnedPlace(const Place &place) {
}
}
static std::unordered_map<std::string, size_t> global_registered_device_type_id;
static std::unordered_map<size_t, std::string> global_registered_device_type;
CustomRegisteredDeviceMap &CustomRegisteredDeviceMap::Instance() {
static CustomRegisteredDeviceMap g_custom_registered_device_map;
return g_custom_registered_device_map;
}
size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) {
size_t CustomRegisteredDeviceMap::GetOrRegisterGlobalDeviceTypeId(
const std::string &device_type) {
if (device_type.empty()) return 0;
if (global_registered_device_type_id.find(device_type) ==
global_registered_device_type_id.end()) {
size_t device_type_id = global_registered_device_type_id.size() + 1;
global_registered_device_type_id[device_type] = device_type_id;
global_registered_device_type[device_type_id] = device_type;
if (registered_device_type_id_.find(device_type) ==
registered_device_type_id_.end()) {
size_t device_type_id = registered_device_type_id_.size() + 1;
registered_device_type_id_[device_type] = device_type_id;
registered_device_type_[device_type_id] = device_type;
}
return global_registered_device_type_id[device_type];
return registered_device_type_id_[device_type];
}
std::string GetGlobalDeviceType(size_t device_type_id) {
if (global_registered_device_type.find(device_type_id) ==
global_registered_device_type.end())
std::string CustomRegisteredDeviceMap::GetGlobalDeviceType(
size_t device_type_id) {
if (registered_device_type_.find(device_type_id) ==
registered_device_type_.end())
return "";
return global_registered_device_type[device_type_id];
return registered_device_type_[device_type_id];
}
constexpr static int kAllocationTypeBitLength = 8;
......@@ -143,7 +147,9 @@ static int8_t GetCorrectDeviceIdByPlaceType(
Place::Place(paddle::PlaceType type)
: device(detail::GetCorrectDeviceIdByPlaceType(type)),
alloc_type_(static_cast<AllocationType>(type)),
device_type_id_(GetOrRegisterGlobalDeviceTypeId("")) {
device_type_id_(
CustomRegisteredDeviceMap::Instance().GetOrRegisterGlobalDeviceTypeId(
"")) {
LOG_FIRST_N(WARNING, 1)
<< "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version "
"2.3, and will be removed in version 2.4! Please use "
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#include "paddle/phi/api/include/dll_decl.h"
......@@ -37,11 +38,21 @@ enum class AllocationType : int8_t {
CUSTOM = 9,
};
const char* AllocationTypeStr(AllocationType type);
class CustomRegisteredDeviceMap {
public:
static CustomRegisteredDeviceMap& Instance();
size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type);
size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type);
std::string GetGlobalDeviceType(size_t device_type_id_);
std::string GetGlobalDeviceType(size_t device_type_id_);
private:
CustomRegisteredDeviceMap() = default;
std::unordered_map<std::string, size_t> registered_device_type_id_;
std::unordered_map<size_t, std::string> registered_device_type_;
};
const char* AllocationTypeStr(AllocationType type);
/// \brief The place is used to specify where the data is stored.
class PADDLE_API Place {
......@@ -53,12 +64,14 @@ class PADDLE_API Place {
const std::string& dev_type = "")
: device(id),
alloc_type_(type),
device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
device_type_id_(phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
explicit Place(AllocationType type, const std::string& dev_type = "")
: device(0),
alloc_type_(type),
device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
device_type_id_(phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
// See NOTE [ Why need to temporarily adapt to PlaceType? ]
Place(paddle::PlaceType type); // NOLINT
......@@ -69,7 +82,8 @@ class PADDLE_API Place {
alloc_type_ = type;
device = device_id;
if (!dev_type.empty()) {
device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type);
device_type_id_ = phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(dev_type);
}
}
......@@ -78,7 +92,8 @@ class PADDLE_API Place {
int8_t GetDeviceId() const { return device; }
std::string GetDeviceType() const {
return GetGlobalDeviceType(device_type_id_);
return phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
}
std::string DebugString() const;
......
......@@ -110,6 +110,7 @@ class ArgumentMappingContext {
virtual bool IsSelectedRowsInput(const std::string& name) const = 0;
virtual bool IsSelectedRowsInputs(const std::string& name) const = 0;
virtual bool IsSparseCooTensorInput(const std::string& name) const = 0;
virtual bool IsSparseCooTensorOutput(const std::string& name) const = 0;
virtual bool IsSparseCsrTensorInput(const std::string& name) const = 0;
// For compatibility with LoDTensorArray
virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0;
......
......@@ -46,7 +46,8 @@ Backend TransToPhiBackend(const phi::Place& place) {
case AllocationType::CUSTOM:
return static_cast<Backend>(
static_cast<size_t>(Backend::NUM_BACKENDS) +
GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
phi::CustomRegisteredDeviceMap::Instance()
.GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"Unsupported transform %s to phi Backend.", place));
......@@ -91,7 +92,9 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_);
std::string device_type =
phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
device_type_id_);
if (!device_type.empty()) {
return phi::CustomPlace(
device_type,
......
......@@ -1010,6 +1010,18 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune,
#endif
/*
* CUDA Graph related FLAG
* Name: FLAGS_new_executor_use_cuda_graph
* Since Version: 2.4
* Value Range: bool, default=false
 * Example: FLAGS_new_executor_use_cuda_graph=true would allow the
 * new executor to use CUDA Graph.
*/
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph,
false,
"Use CUDA Graph in new executor");
DEFINE_int32(record_pool_max_size,
2000000,
"SlotRecordDataset slot record pool max size");
......@@ -1181,3 +1193,16 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, "");
PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache,
false,
"Add a persistent ibuilder.");
/**
* mmap_allocator related FLAG
* Name: use_shm_cache
* Since Version: 2.5.0
* Value Range: bool, default=true
 * Example: FLAGS_use_shm_cache=false would disable the shm file cache.
 * Note: If True, mmap_allocator will cache the shm file to reduce munmap
 * operations.
*/
PADDLE_DEFINE_EXPORTED_bool(use_shm_cache,
true,
"Use shm cache in mmap_allocator.");
......@@ -101,6 +101,12 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout,
default_key.dtype(),
arg_type);
} else if (arg_type ==
std::type_index(typeid(const phi::ExtendedTensor&))) {
args_def->AppendInput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
} else if (arg_type == std::type_index(typeid(
const std::vector<const ExtendedTensor*>&))) {
args_def->AppendInput(default_key.backend(),
......
The diffs of the remaining changed files are collapsed and not shown here.