Commit f3a5d012 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

......@@ -118,7 +118,7 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
......@@ -151,6 +151,9 @@ function(kernel_library TARGET)
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
list(APPEND kps_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
endif()
else()
# TODO(chenweihang): impl compile by source later
......
......@@ -35,6 +35,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sigmoid, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sigmoid_grad, CPU, ALL_LAYOUT);
namespace egr {
......
......@@ -32,6 +32,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sigmoid, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sigmoid_grad, CPU, ALL_LAYOUT);
namespace egr {
......
......@@ -25,6 +25,12 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(conv2d_transpose, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(batch_norm, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(gelu, CPU, ALL_LAYOUT);
USE_OP_ITSELF(batch_norm);
USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
......
......@@ -23,7 +23,7 @@ cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
# skip win32 since wget is not installed by default on Windows machines.
# skip COVERAGE_CI since the test runs slowly because of instrumentation.
if (WITH_CUDA AND WITH_TESTING AND NOT WIN32 AND NOT WITH_COVERAGE AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT WITH_COVERAGE AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
add_custom_target(
download_program
COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program
......
......@@ -41,6 +41,7 @@ namespace paddle {
namespace framework {
// NOTE(Aurelius84): Need a better strategy to determine it.
static constexpr size_t kHostNumThreads = 4;
static constexpr size_t kDeviceNumThreads = 1;
bool IsInterpretercoreFastGCEnabled() {
return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator;
......@@ -54,8 +55,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
global_scope_(global_scope),
stream_analyzer_(place) {
is_build_ = false;
async_work_queue_.reset(
new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_));
async_work_queue_.reset(new interpreter::AsyncWorkQueue(
kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsInterpretercoreFastGCEnabled()) {
......@@ -271,6 +272,10 @@ void InterpreterCore::Convert(
if (FLAGS_new_executor_use_inplace) {
BuildInplace();
}
// Prepare the atomic dep counts and var refcounts for the first run.
async_work_queue_->PrepareAtomicDeps(dependecy_count_);
async_work_queue_->PrepareAtomicVarRef(vec_meta_info);
}
bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
......@@ -388,18 +393,18 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
: global_scope_->GetMutableScope();
auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
{
platform::RecordEvent infershape_event(
"infer_shape", platform::TracerEventType::OperatorInner, 1,
platform::EventRole::kInnerOp);
// If it is OperatorBase, InferShape does nothing.
if (op_with_kernel != nullptr)
if (op_with_kernel != nullptr) {
platform::RecordEvent infershape_event(
"infer_shape", platform::TracerEventType::OperatorInner, 1,
platform::EventRole::kInnerOp);
// If it is OperatorBase, InferShape does nothing.
op_with_kernel->Info().infer_shape_(
instr_node.InnerInferShapeContext().get());
}
}
if (op_with_kernel != nullptr &&
FLAGS_new_executor_use_inplace) { // TODO(xiongkun03) Does operator
// base support inplace ?
if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
// TODO(xiongkun03): Does operator base support inplace?
for (auto& pair : instr_node.InplaceInfo()) {
const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
auto* out =
......@@ -409,6 +414,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
}
}
}
{
platform::RecordEvent compute_event(
"compute", platform::TracerEventType::OperatorInner, 1,
......@@ -458,16 +464,24 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
void InterpreterCore::ExecuteInstructionList(
const std::vector<Instruction>& vec_instr) {
// NOTE(zhiqiu): get the prepared deps from std::future, and asynchronously
// prepare them for the next step
auto atomic_deps = async_work_queue_->AtomicDeps();
auto atomic_var_ref = async_work_queue_->AtomicVarRef();
async_work_queue_->PrepareAtomicDeps(dependecy_count_);
async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
unfinished_op_numer_ = vec_instr.size();
exception_holder_.Clear();
for (size_t i = 0; i < dependecy_count_.size(); ++i) {
if (dependecy_count_[i] == 0) {
async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
[&, i] { RunInstructionAsync(i); });
async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [
this, i, atomic_deps = atomic_deps.get(),
atomic_var_ref = atomic_var_ref.get()
] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); });
}
}
......@@ -490,11 +504,16 @@ void InterpreterCore::ExecuteInstructionList(
}
void InterpreterCore::RunNextInstructions(
const Instruction& instr, std::queue<size_t>* reserved_next_ops) {
const Instruction& instr, std::queue<size_t>* reserved_next_ops,
std::vector<std::atomic<size_t>>* atomic_deps,
std::vector<std::atomic<size_t>>* atomic_var_ref) {
VLOG(4) << "atomic 1:" << atomic_deps;
auto& next_instr = instr.NextInstructions();
auto& atomic_deps = async_work_queue_->AtomicDeps();
auto IsReady = [&](size_t next_id) {
return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
auto IsReady = [atomic_deps](size_t next_id) {
VLOG(4) << "atomic:" << atomic_deps << " " << &(*atomic_deps)[next_id]
<< " " << next_id;
return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
};
if (instr.KernelType() == OpFuncType::kQueueAsync) {
......@@ -503,7 +522,9 @@ void InterpreterCore::RunNextInstructions(
if (IsReady(next_id)) {
async_work_queue_->AddTask(
vec_instruction_[next_id].KernelType(),
[&, next_id] { RunInstructionAsync(next_id); });
[this, next_id, atomic_deps, atomic_var_ref]() {
RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
});
}
}
// keep all async_ops running in current thread
......@@ -523,7 +544,9 @@ void InterpreterCore::RunNextInstructions(
if (IsReady(next_id)) {
async_work_queue_->AddTask(
vec_instruction_[next_id].KernelType(),
[&, next_id] { RunInstructionAsync(next_id); });
[this, next_id, atomic_deps, atomic_var_ref] {
RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
});
}
}
auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
......@@ -539,14 +562,18 @@ void InterpreterCore::RunNextInstructions(
// move rest ops into other threads
async_work_queue_->AddTask(
vec_instruction_[next_id].KernelType(),
[&, next_id] { RunInstructionAsync(next_id); });
[this, next_id, atomic_deps, atomic_var_ref] {
RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
});
}
}
if (first_op != 0) reserved_next_ops->push(first_op);
}
}
void InterpreterCore::RunInstructionAsync(size_t instr_id) {
void InterpreterCore::RunInstructionAsync(
size_t instr_id, std::vector<std::atomic<size_t>>* atomic_deps,
std::vector<std::atomic<size_t>>* atomic_var_ref) {
std::queue<size_t> ready_ops;
ready_ops.push(instr_id);
while (!ready_ops.empty()) {
......@@ -571,7 +598,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
RecordStreamForGC(instr_node);
#endif
CheckGC(instr_node);
CheckGC(instr_node, atomic_var_ref);
} catch (platform::EnforceNotMet& ex) {
framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
......@@ -605,7 +632,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
interpreter::RecordEvent(instr_node, place_);
RunNextInstructions(instr_node, &ready_ops);
RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
}
}
......@@ -703,17 +730,19 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
}
#endif
void InterpreterCore::CheckGC(const Instruction& instr) {
void InterpreterCore::CheckGC(
const Instruction& instr,
std::vector<std::atomic<size_t>>* atomic_var_ref) {
size_t instr_id = instr.Id();
auto& var_scope = *global_scope_;
auto& atomic_var_ref = async_work_queue_->AtomicVarRef();
for (auto var_id : instr.GCCheckVars()) {
VLOG(4) << "GC " << global_scope_->GetNameById(var_id) << " "
<< var_scope.VarDesc(var_id);
VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
<< " " << var_id;
bool is_ready =
atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
(*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
// Ignore all persistable vars during GC.
if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
continue;
......
......@@ -76,11 +76,16 @@ class InterpreterCore {
void RecordStreamForGC(const Instruction& instr);
#endif
void CheckGC(const Instruction& instr);
void CheckGC(const Instruction& instr,
std::vector<std::atomic<size_t>>* atomic_var_ref);
void RunInstructionAsync(size_t instr_id);
void RunInstructionAsync(size_t instr_id,
std::vector<std::atomic<size_t>>* atomic_deps,
std::vector<std::atomic<size_t>>* atomic_var_ref);
void RunNextInstructions(const Instruction& instr_id,
std::queue<size_t>* reserved_next_ops);
std::queue<size_t>* reserved_next_ops,
std::vector<std::atomic<size_t>>* atomic_deps,
std::vector<std::atomic<size_t>>* atomic_var_ref);
void BuildSkipShareLoDInfo();
......
......@@ -44,32 +44,37 @@ void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type,
using VariableIdMap = std::map<std::string, std::vector<int>>;
AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps(
void AsyncWorkQueue::PrepareAtomicDeps(
const std::vector<size_t>& dependecy_count) {
if (atomic_deps_.size() != dependecy_count.size()) {
atomic_deps_.clear();
std::generate_n(std::back_inserter(atomic_deps_), dependecy_count.size(),
[] { return std::make_unique<std::atomic<size_t>>(0); });
}
for (size_t i = 0; i < dependecy_count.size(); ++i) {
atomic_deps_[i]->store(dependecy_count[i]);
}
return atomic_deps_;
VLOG(4) << "PrepareAtomicDeps";
auto p = std::make_shared<
std::promise<std::unique_ptr<std::vector<std::atomic<size_t>>>>>();
atomic_deps_ = p->get_future();
queue_group_->AddTask(2, [&dependecy_count, p] {
auto* op_deps =
new std::vector<std::atomic<size_t>>(dependecy_count.size());
for (size_t i = 0; i < dependecy_count.size(); ++i) {
(*op_deps)[i] = dependecy_count[i];
}
VLOG(4) << "AtomicDeps:" << op_deps << " " << (*op_deps).size();
p->set_value(std::unique_ptr<std::vector<std::atomic<size_t>>>(op_deps));
});
}
AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicVarRef(
void AsyncWorkQueue::PrepareAtomicVarRef(
const std::vector<VariableMetaInfo>& vec_meta_info) {
if (atomic_var_ref_.size() != vec_meta_info.size()) {
atomic_var_ref_.clear();
std::generate_n(std::back_inserter(atomic_var_ref_), vec_meta_info.size(),
[] { return std::make_unique<std::atomic<size_t>>(0); });
}
for (size_t i = 0; i < vec_meta_info.size(); ++i) {
atomic_var_ref_[i]->store(vec_meta_info[i].var_ref_count_);
}
return atomic_var_ref_;
VLOG(4) << "PrepareAtomicVarRef";
auto p = std::make_shared<
std::promise<std::unique_ptr<std::vector<std::atomic<size_t>>>>>();
atomic_var_ref_ = p->get_future();
queue_group_->AddTask(2, [&vec_meta_info, p] {
auto* var_ref = new std::vector<std::atomic<size_t>>(vec_meta_info.size());
for (size_t i = 0; i < vec_meta_info.size(); ++i) {
(*var_ref)[i] = vec_meta_info[i].var_ref_count_;
}
VLOG(4) << "AtomicVarRef:" << var_ref << " " << (*var_ref).size();
p->set_value(std::unique_ptr<std::vector<std::atomic<size_t>>>(var_ref));
});
}
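The rewritten PrepareAtomicDeps/PrepareAtomicVarRef above move construction of the atomic vectors onto the work queue and hand the result back through a std::future. Below is a minimal, self-contained sketch of that pattern; a plain std::thread stands in for queue_group_->AddTask, and all names (PrepareDeps, AtomicVec) are illustrative, not Paddle APIs:

```cpp
#include <atomic>
#include <future>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>

using AtomicVec = std::vector<std::atomic<size_t>>;

// Asynchronously build a vector of atomic counters initialized from `counts`;
// the promise/future pair transfers ownership back to the consumer.
std::future<std::unique_ptr<AtomicVec>> PrepareDeps(std::vector<size_t> counts) {
  auto p = std::make_shared<std::promise<std::unique_ptr<AtomicVec>>>();
  auto fut = p->get_future();
  std::thread([counts = std::move(counts), p] {  // stand-in for AddTask
    auto deps = std::make_unique<AtomicVec>(counts.size());
    for (size_t i = 0; i < counts.size(); ++i) (*deps)[i] = counts[i];
    p->set_value(std::move(deps));
  }).detach();
  return fut;
}

int main() {
  // Consumer side (mirrors ExecuteInstructionList): take ownership via get(),
  // then capture the raw pointer in task lambdas while the unique_ptr keeps
  // the storage alive for the whole step.
  auto deps = PrepareDeps({2, 0, 1}).get();
  AtomicVec* raw = deps.get();
  bool ready = (*raw)[0].fetch_sub(1, std::memory_order_relaxed) == 1;
  std::cout << std::boolalpha << ready << '\n';  // false: one dep remains
}
```

The fetch_sub readiness test is the same one RunNextInstructions and CheckGC use: an instruction or variable becomes ready only when the decrement observes the final count of 1.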
bool var_can_be_deleted(const std::string& name, const BlockDesc& block) {
......
......@@ -50,11 +50,13 @@ namespace framework {
namespace interpreter {
using AtomicVectorSizeT = std::vector<std::unique_ptr<std::atomic<size_t>>>;
using AtomicVectorSizeT =
std::future<std::unique_ptr<std::vector<std::atomic<size_t>>>>;
class AsyncWorkQueue {
public:
AsyncWorkQueue(size_t host_num_threads, EventsWaiter* waiter)
AsyncWorkQueue(size_t host_num_threads, size_t device_num_threads,
               EventsWaiter* waiter)
: host_num_thread_(host_num_threads) {
std::vector<WorkQueueOptions> group_options;
// for executing host kernels
......@@ -66,6 +68,13 @@ class AsyncWorkQueue {
/*events_waiter*/ waiter);
// for launching device kernels
group_options.emplace_back(/*name*/ "DeviceKernelLaunch",
/*num_threads*/ device_num_threads,
/*allow_spinning*/ true,
/*track_task*/ false,
/*detached*/ true,
/*events_waiter*/ waiter);
// for preparing deps and other bookkeeping
group_options.emplace_back(/*name*/ "Prepare",
/*num_threads*/ 1,
/*allow_spinning*/ true,
/*track_task*/ false,
......@@ -74,10 +83,8 @@ class AsyncWorkQueue {
queue_group_ = CreateWorkQueueGroup(group_options);
}
AtomicVectorSizeT& PrepareAtomicDeps(
const std::vector<size_t>& dependecy_count);
AtomicVectorSizeT& PrepareAtomicVarRef(
const std::vector<VariableMetaInfo>& vec_meta_info);
void PrepareAtomicDeps(const std::vector<size_t>& dependecy_count);
void PrepareAtomicVarRef(const std::vector<VariableMetaInfo>& vec_meta_info);
// void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); }
......@@ -85,8 +92,12 @@ class AsyncWorkQueue {
void Cancel() { queue_group_->Cancel(); }
AtomicVectorSizeT& AtomicDeps() { return atomic_deps_; }
AtomicVectorSizeT& AtomicVarRef() { return atomic_var_ref_; }
std::unique_ptr<std::vector<std::atomic<size_t>>> AtomicDeps() {
return atomic_deps_.get();
}
std::unique_ptr<std::vector<std::atomic<size_t>>> AtomicVarRef() {
return atomic_var_ref_.get();
}
private:
size_t host_num_thread_;
......
......@@ -20,45 +20,65 @@
// #include "gperftools/profiler.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/phi/core/kernel_registry.h"
USE_OP_ITSELF(fill_constant);
USE_OP(uniform_random);
USE_OP_ITSELF(uniform_random);
USE_OP(lookup_table);
USE_OP(transpose2);
USE_OP_ITSELF(transpose2);
USE_OP_ITSELF(reshape2);
USE_OP(split);
USE_OP(slice);
USE_OP(concat);
USE_OP(matmul);
USE_OP_ITSELF(split);
USE_OP_ITSELF(slice);
USE_OP_ITSELF(concat);
USE_OP_ITSELF(matmul);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(tanh);
USE_OP(elementwise_mul);
USE_OP_ITSELF(elementwise_mul);
USE_OP(softmax_with_cross_entropy);
USE_OP_ITSELF(reduce_mean);
USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(reduce_mean_grad);
USE_OP_ITSELF(reshape2_grad);
USE_OP(softmax_with_cross_entropy_grad);
USE_OP_ITSELF(softmax_with_cross_entropy_grad);
USE_OP_ITSELF(elementwise_add_grad);
USE_OP(matmul_grad);
USE_OP(square);
USE_OP(transpose2_grad);
USE_OP_ITSELF(matmul_grad);
USE_OP_ITSELF(square);
USE_OP_ITSELF(transpose2_grad);
USE_OP(concat_grad);
USE_OP_ITSELF(elementwise_mul_grad);
USE_OP_ITSELF(sigmoid_grad);
USE_OP_ITSELF(tanh_grad);
USE_OP(sum);
USE_OP(slice_grad);
USE_OP(lookup_table_grad);
USE_OP_ITSELF(slice_grad);
USE_OP_ITSELF(lookup_table_grad);
USE_OP(sqrt);
USE_OP(elementwise_max);
USE_OP_ITSELF(elementwise_div);
USE_OP(sgd);
USE_OP_ITSELF(sgd);
USE_OP(squared_l2_norm);
USE_OP(memcpy_h2d);
USE_OP(memcpy_d2h);
USE_OP_ITSELF(memcpy_h2d);
USE_OP_ITSELF(memcpy_d2h);
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(uniform_random_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(transpose, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(reshape_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(transpose_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
DECLARE_double(eager_delete_tensor_gb);
namespace paddle {
......
......@@ -1122,7 +1122,15 @@ static void CheckTensorNANOrInf(const std::string& op_type,
bool OperatorWithKernel::SupportsMKLDNN(
const proto::VarType::Type data_type) const {
auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_);
if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) {
VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid "
"Registered Kernels. And We don't "
"search its kernels in phi lib, "
"SupportsMKLDNN() return false.";
return false;
}
auto& op_kernels = op_kernel_iter->second;
return std::any_of(op_kernels.begin(), op_kernels.end(),
[data_type](OpKernelMap::const_reference kern_pair) {
return platform::is_cpu_place(kern_pair.first.place_) &&
......
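The change above replaces `AllOpKernels().at(type_)`, which throws std::out_of_range when the op has no fluid kernels, with a find() lookup that degrades gracefully. A standalone sketch of the difference, using toy types rather than the Paddle registry:

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

using KernelMap = std::map<std::string, std::vector<std::string>>;

bool SupportsBackend(const KernelMap& registry, const std::string& op) {
  auto it = registry.find(op);  // at(op) would throw for a missing key
  if (it == registry.end()) {
    std::cerr << "no fluid kernels registered for " << op << '\n';
    return false;  // the op may live entirely in the phi library
  }
  return !it->second.empty();
}

int main() {
  KernelMap registry{{"matmul", {"CPU", "GPU"}}};
  std::cout << SupportsBackend(registry, "matmul") << ' '
            << SupportsBackend(registry, "squeeze2") << '\n';  // prints: 1 0
}
```

This matters here because ops migrated to phi may legitimately have no entry in the fluid kernel map, so a missing key is an expected case rather than an error.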
......@@ -144,6 +144,9 @@ class Scope : public ScopeBase {
void Rename(const std::string& origin_name,
const std::string& new_name) const;
// Return the number of variables in scope
size_t Size() { return vars_.size(); }
// Rename variable to a new name and return the new name
std::string Rename(const std::string& origin_name) const;
......
......@@ -32,6 +32,8 @@
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
......
......@@ -1496,6 +1496,9 @@ REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
HardSigmoidGradFunctor);
REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
LogSigmoidGradFunctor);
REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor);
REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor);
REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
/* ========================== sigmoid register =============================
*/
......@@ -1867,15 +1870,6 @@ REGISTER_OPERATOR(
ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CPU_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
......
......@@ -281,6 +281,11 @@ USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid)
USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid)
USE_PHI_FUNCTOR(LogSigmoid)
USE_PHI_FUNCTOR(HardSigmoid)
USE_PHI_FUNCTOR(Log)
USE_PHI_DOUBLE_GRAD_FUNCTOR(Log)
USE_PHI_FUNCTOR(Log2)
USE_PHI_FUNCTOR(Log10)
USE_PHI_FUNCTOR(Log1p)
template <typename T>
using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
......@@ -448,88 +453,6 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
}
};
// log(x) = natural logarithm of x
template <typename T>
struct LogFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log();
}
};
template <typename T>
struct LogGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) / x);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log2(x) = logarithm to the base 2 of the elements of x
template <typename T>
struct Log2Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log() / static_cast<T>(log(2));
}
};
// the gradient of log2(x) is 1/(x*ln(2))
template <typename T>
struct Log2GradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log10(x) = logarithm to the base 10 of the elements of x
template <typename T>
struct Log10Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log() / static_cast<T>(log(10));
}
};
// the gradient of log10(x) is 1/(x*ln(10))
template <typename T>
struct Log10GradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log1p(x) = natural logarithm of x+1
template <typename T>
struct Log1pFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = (static_cast<T>(1) + x).log();
}
};
template <typename T>
struct Log1pGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// square(x) = x^2
template <typename T>
struct SquareFunctor : public BaseActivationFunctor<T> {
......@@ -1197,37 +1120,6 @@ class SquareDoubleGradKernel
}
};
template <typename DeviceContext, typename Functor>
class LogDoubleGradKernel
: public SquareDoubleGradKernel<DeviceContext, Functor> {};
template <typename DeviceContext, typename Functor>
class ELUDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *X, *ddX, *dOut;
X = ddX = dOut = nullptr;
framework::Tensor *dX, *ddOut;
dX = ddOut = nullptr;
ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut);
if (dX) dX->mutable_data<T>(X->dims(), ctx.GetPlace());
if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(place, X, ddX, ddOut, dOut, dX);
}
};
template <typename DeviceContext, typename Functor>
class CELUDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
......@@ -1522,36 +1414,6 @@ class LogitGradKernel : public framework::OpKernel<T> {
}
};
template <typename T>
struct LogGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
// ddout = ddx / x; dx = -(dout / x) * (ddx / x)
// calculate dx first, so ddout can inplace ddx
if (dX) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
ddout.device(*d) = ddx * static_cast<T>(1) / x;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
} // namespace operators
} // namespace paddle
......@@ -1560,9 +1422,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
__macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \
__macro(round, Round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \
__macro(log2, Log2, Log2Functor, Log2GradFunctor); \
__macro(log10, Log10, Log10Functor, Log10GradFunctor); \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \
__macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \
......
......@@ -131,27 +131,6 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaLogFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// log(x) = log(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log(x));
}
};
template <typename T>
struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
// dx = dout / x
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / x;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSquareFunctor : public BaseActivationFunctor<T> {
// square(x) = x * x
......@@ -220,78 +199,6 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// log1p(x) = log(1 + x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log(one + x));
}
};
template <typename T>
struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout / (1 + x)
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (one + x);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog2Functor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// log2(x) = log2(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log2(x));
}
};
template <typename T>
struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
// dx = dout / (x * log(2))
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (x * log_two);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog10Functor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// log10(x) = log10(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log10(x));
}
};
template <typename T>
struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
// dx = dout / (x * log(10))
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (x * log_ten);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
......@@ -773,6 +680,10 @@ USE_PHI_FUNCTOR(CudaELU)
USE_PHI_FUNCTOR(CudaSigmoid)
USE_PHI_FUNCTOR(CudaLogSigmoid)
USE_PHI_FUNCTOR(CudaHardSigmoid)
USE_PHI_FUNCTOR(CudaLog)
USE_PHI_FUNCTOR(CudaLog2)
USE_PHI_FUNCTOR(CudaLog10)
USE_PHI_FUNCTOR(CudaLog1p)
template <typename T>
using CudaELUGradNegativeAlphaFunctor =
......@@ -975,18 +886,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::CudaExpm1GradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor);
REGISTER_OP_CUDA_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
__macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \
CudaSoftShrinkGradFunctor); \
......@@ -995,9 +894,6 @@ REGISTER_OP_CUDA_KERNEL(
__macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor); \
__macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \
CudaReciprocalGradFunctor); \
__macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \
__macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \
__macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \
__macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
__macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \
__macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \
......
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
#include "paddle/fluid/operators/batch_size_like.h"
namespace paddle {
......@@ -23,9 +22,13 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
using BatchSizeLikeOp::BatchSizeLikeOp;
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::OpKernelType kernel_type = framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context());
if (ctx.Attr<bool>("force_cpu")) {
kernel_type.place_ = platform::CPUPlace();
}
return kernel_type;
}
};
......@@ -64,15 +67,3 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::FillConstantBatchSizeLikeOpMaker,
ops::BatchSizeLikeNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
double>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
int>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
int64_t>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
bool>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto float_value = ctx.Attr<float>("value");
auto str_value = ctx.Attr<std::string>("str_value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto *out = ctx.Output<framework::Tensor>("Out");
auto *in = ctx.Input<framework::LoDTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the LoDTensor.
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
if (cpu_place) {
auto &dev_ctx = *pool.Get(platform::CPUPlace());
phi::funcs::SetConstant<platform::CPUDeviceContext, T> functor;
out->mutable_data(platform::CPUPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const platform::CPUDeviceContext &>(dev_ctx),
out, static_cast<T>(value));
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!cpu_place) {
auto &dev_ctx = *pool.Get(ctx.GetPlace());
phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
out->mutable_data(ctx.GetPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx),
out, static_cast<T>(value));
}
#endif
}
};
} // namespace operators
} // namespace paddle
......@@ -19,7 +19,9 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -113,13 +115,13 @@ class SqueezeOp : public framework::OperatorWithKernel {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
//#ifdef PADDLE_WITH_MKLDNN
// #ifdef PADDLE_WITH_MKLDNN
// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
// return framework::OpKernelType(input_data_type, ctx.GetPlace(),
// framework::DataLayout::kMKLDNN,
// framework::LibraryType::kMKLDNN);
// }
//#endif
// #endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
......@@ -140,13 +142,13 @@ class SqueezeGradOp : public framework::OperatorWithKernel {
auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
//#ifdef PADDLE_WITH_MKLDNN
// #ifdef PADDLE_WITH_MKLDNN
// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
// return framework::OpKernelType(input_data_type, ctx.GetPlace(),
// framework::DataLayout::kMKLDNN,
// framework::LibraryType::kMKLDNN);
// }
//#endif
// #endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
......@@ -201,53 +203,18 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
class Squeeze2Op : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Squeeze2");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Squeeze2");
const auto &x_dims = ctx->GetInputDim("X");
// Check input tensor dims (<6) Eigen limit.
PADDLE_ENFORCE_LE(x_dims.size(), 6,
platform::errors::InvalidArgument(
"The dimensions of Input(X) "
"should be in the range of [1, 6] (Eigen limit)."
"But received X's dimensions = %d, X's shape = [%s].",
x_dims.size(), x_dims));
const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
auto out_dims = GetOutputShape(axes, x_dims, false);
ctx->SetOutputDim("Out", out_dims);
if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
ctx->ShareLoD("X", "Out");
}
if (!ctx->HasOutput("XShape")) return;
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
//#ifdef PADDLE_WITH_MKLDNN
// #ifdef PADDLE_WITH_MKLDNN
// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
// return framework::OpKernelType(input_data_type, ctx.GetPlace(),
// framework::DataLayout::kMKLDNN,
// framework::LibraryType::kMKLDNN);
// }
//#endif
// #endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
......@@ -287,13 +254,13 @@ class Squeeze2GradOp : public framework::OperatorWithKernel {
auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
//#ifdef PADDLE_WITH_MKLDNN
// #ifdef PADDLE_WITH_MKLDNN
// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
// return framework::OpKernelType(input_data_type, ctx.GetPlace(),
// framework::DataLayout::kMKLDNN,
// framework::LibraryType::kMKLDNN);
// }
//#endif
// #endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
......@@ -365,6 +332,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(squeeze2, SqueezeInferShapeFunctor,
PD_INFER_META(phi::SqueezeInferMeta));
REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
ops::SqueezeGradOpMaker<paddle::framework::OpDesc>,
ops::SqueezeGradOpMaker<paddle::imperative::OpBase>);
......@@ -376,7 +347,7 @@ REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp,
REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
ops::Squeeze2GradOpMaker<paddle::framework::OpDesc>,
ops::Squeeze2GradOpMaker<paddle::imperative::OpBase>,
ops::SqueezeInplaceInferer);
ops::SqueezeInplaceInferer, SqueezeInferShapeFunctor);
REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
ops::Squeeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
ops::Squeeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
......@@ -411,34 +382,3 @@ REGISTER_OP_CPU_KERNEL(
paddle::platform::complex<double>>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
squeeze2_grad,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
......@@ -46,33 +46,3 @@ REGISTER_OP_CUDA_KERNEL(
paddle::platform::complex<float>>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, bool>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
squeeze2_grad,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
plat::bfloat16>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
......@@ -18,7 +18,9 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -251,19 +253,6 @@ class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
class Unsqueeze2Op : public UnsqueezeOp {
public:
using UnsqueezeOp::UnsqueezeOp;
void InferShape(framework::InferShapeContext *ctx) const override {
UnsqueezeOp::InferShape(ctx);
const auto &x_dims = ctx->GetInputDim("X");
if (!ctx->HasOutput("XShape")) return;
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
......@@ -339,10 +328,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X");
} // namespace operators
} // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2, Unsqueeze2InferShapeFunctor,
PD_INFER_META(phi::UnsqueezeInferMeta));
namespace ops = paddle::operators;
REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
ops::UnsqueezeGradOpMaker<paddle::framework::OpDesc>,
ops::UnsqueezeGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
ops::UnsqueezeDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::UnsqueezeDoubleGradOpMaker<paddle::imperative::OpBase>,
......@@ -351,7 +344,8 @@ REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
ops::Unsqueeze2GradOpMaker<paddle::framework::OpDesc>,
ops::Unsqueeze2GradOpMaker<paddle::imperative::OpBase>,
ops::UnsqueezeInplaceInferer);
Unsqueeze2InferShapeFunctor, ops::UnsqueezeInplaceInferer);
REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
ops::Unsqueeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
ops::Unsqueeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
......@@ -388,34 +382,3 @@ REGISTER_OP_CPU_KERNEL(
paddle::platform::complex<double>>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
unsqueeze2, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int16_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int16_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
......@@ -50,37 +50,3 @@ REGISTER_OP_CUDA_KERNEL(
paddle::platform::complex<float>>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
unsqueeze2,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int16_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
plat::bfloat16>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int16_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
......@@ -1761,6 +1761,7 @@ All parameter, weight, gradient are variables in Paddle.
out (core.Variable|None): the found variable or None.
)DOC",
py::return_value_policy::reference)
.def("size", &Scope::Size)
.def("erase", &Scope::EraseVars, py::arg("names"),
R"DOC(
Find variable named :code:`name` in the current scope or
......@@ -2857,6 +2858,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("run",
[](StandaloneExecutor &self, std::vector<std::string> feed_names,
std::vector<std::string> fetch_names) {
platform::RecordEvent record_event(
"StandaloneExecutor:run",
platform::TracerEventType::UserDefined, 1);
paddle::framework::FetchList ret;
{
pybind11::gil_scoped_release release;
......
......@@ -42,6 +42,10 @@ const std::unordered_set<std::string> deprecated_op_names({"diag",
"flatten_grad",
"isinf",
"isnan",
"unsqueeze",
"unsqueeze_grad",
"squeeze",
"squeeze_grad",
"isfinite",
"matmul",
"matmul_grad",
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/unfold_functor.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
......@@ -1497,6 +1498,40 @@ void SplitInferMeta(const MetaTensor& x,
}
}
void SqueezeInferMeta(const MetaTensor& x,
const std::vector<int>& axes,
MetaTensor* xshape,
MetaTensor* out) {
const auto& x_dims = x.dims();
// Check input tensor dims (<6) Eigen limit.
PADDLE_ENFORCE_LE(x_dims.size(),
6,
phi::errors::InvalidArgument(
"The dimensions of Input(X) "
"should be in the range of [1, 6] (Eigen limit)."
"But received X's dimensions = %d, X's shape = [%s].",
x_dims.size(),
x_dims));
auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, false);
out->set_dims(out_dims);
if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
out->share_lod(x);
}
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
xshape->set_dims(phi::make_ddim(xshape_dims));
xshape->share_lod(x);
xshape->set_dtype(x.dtype());
out->set_dtype(x.dtype());
}
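As a worked illustration of the shapes involved (a standalone sketch; SqueezeShape below only approximates funcs::GetOutputSqueezeShape and is not the Paddle helper): squeezing axes {0, 2} of a [1, 3, 1, 5] tensor yields [3, 5], and XShape stores a leading 0 followed by the input dims so the gradient op can recover the input shape without keeping X alive.

```cpp
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

// Approximate squeeze shape inference: drop size-1 dims, either at the
// requested axes or (if none are given) everywhere.
std::vector<int64_t> SqueezeShape(const std::vector<int64_t>& x,
                                  const std::set<int>& axes) {
  std::vector<int64_t> out;
  for (int i = 0; i < static_cast<int>(x.size()); ++i) {
    bool drop = axes.empty() ? (x[i] == 1) : (axes.count(i) > 0 && x[i] == 1);
    if (!drop) out.push_back(x[i]);
  }
  return out;
}

int main() {
  std::vector<int64_t> x = {1, 3, 1, 5};
  auto out = SqueezeShape(x, {0, 2});               // -> {3, 5}
  std::vector<int64_t> xshape = {0};                // xshape = {0} + x_dims
  xshape.insert(xshape.end(), x.begin(), x.end());  // -> {0, 1, 3, 1, 5}
  for (auto d : out) std::cout << d << ' ';         // prints: 3 5
  std::cout << '\n';
}
```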
/* Why not use SumRawInferMeta directly?
Because we need to make InferMetaFunction's args follow the design of api.yaml
*/
......@@ -1982,6 +2017,41 @@ void UnfoldInferMeta(const MetaTensor& x,
out->set_dims(phi::make_ddim(out_dims));
}
void UnsqueezeInferMeta(const MetaTensor& x,
const ScalarArray& axes,
MetaTensor* xshape,
MetaTensor* out) {
const auto& x_dims = x.dims();
// Validity Check: input tensor dims (<6).
PADDLE_ENFORCE_LE(x_dims.size(),
6,
phi::errors::InvalidArgument(
"Invalid "
"dimensions, the rank of Input(X) "
"should be in the range of [1, 6] (Eigen limit)"));
if (!axes.GetData().empty()) {
std::vector<int32_t> tmp;
tmp.reserve(axes.GetData().size());
std::for_each(axes.GetData().begin(),
axes.GetData().end(),
[&tmp](const int64_t& t) { tmp.push_back(t); });
auto out_dims = funcs::GetUnsqueezeShape(tmp, x_dims);
out->set_dims(out_dims);
if (x_dims[0] == out_dims[0]) {
out->share_lod(x);
}
}
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
xshape->set_dims(phi::make_ddim(xshape_dims));
xshape->share_lod(x);
out->set_dtype(x.dtype());
xshape->set_dtype(x.dtype());
}
void OneHotRawInferMeta(const MetaTensor& x,
int32_t depth,
DataType dtype,
......@@ -1992,7 +2062,6 @@ void OneHotRawInferMeta(const MetaTensor& x,
x_dims.size(),
1,
phi::errors::InvalidArgument("Rank of Input(X) should be at least 1."));
auto out_dims_vec = phi::vectorize(x_dims);
out_dims_vec.push_back(depth);
auto out_dims = phi::make_ddim(out_dims_vec);
......
......@@ -229,6 +229,11 @@ void SplitInferMeta(const MetaTensor& x_meta,
std::vector<MetaTensor*> out,
MetaConfig config = MetaConfig());
void SqueezeInferMeta(const MetaTensor& x,
const std::vector<int>& axes,
MetaTensor* xshape,
MetaTensor* out);
void SumInferMeta(const MetaTensor& x,
const std::vector<int64_t>& axis,
DataType dtype,
......@@ -290,6 +295,11 @@ void UnfoldInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void UnsqueezeInferMeta(const MetaTensor& x,
const ScalarArray& axes,
MetaTensor* xshape,
MetaTensor* out);
void OneHotRawInferMeta(const MetaTensor& x,
int32_t depth,
DataType dtype,
......
......@@ -62,3 +62,6 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $
add_subdirectory(sparse)
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
# 5. kernel autotune
add_subdirectory(autotune)
......@@ -135,6 +135,14 @@ void SigmoidTripleGradKernel(const Context& dev_ctx,
DenseTensor* d_dout,
DenseTensor* d_ddx);
template <typename T, typename Context>
void LogDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx,
DenseTensor* dx,
DenseTensor* ddout);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos);
......@@ -149,6 +157,10 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
......
......@@ -56,6 +56,10 @@ DECLARE_ACTIVATION_KERNEL(TanhShrink)
DECLARE_ACTIVATION_KERNEL(Silu)
DECLARE_ACTIVATION_KERNEL(Sigmoid)
DECLARE_ACTIVATION_KERNEL(LogSigmoid)
DECLARE_ACTIVATION_KERNEL(Log)
DECLARE_ACTIVATION_KERNEL(Log2)
DECLARE_ACTIVATION_KERNEL(Log10)
DECLARE_ACTIVATION_KERNEL(Log1p)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
......
if (WITH_GPU)
nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
elseif (WITH_ROCM)
hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
endif()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
namespace phi {
class GpuTimer {
public:
GpuTimer() {
#ifdef PADDLE_WITH_HIP
hipEventCreate(&start_);
hipEventCreate(&stop_);
#else
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
#endif
PADDLE_ENFORCE_NOT_NULL(
start_, phi::errors::PreconditionNotMet("Start Event is not ready."));
PADDLE_ENFORCE_NOT_NULL(
stop_, phi::errors::PreconditionNotMet("Stop Event is not ready."));
}
~GpuTimer() {
#ifdef PADDLE_WITH_HIP
hipEventDestroy(start_);
hipEventDestroy(stop_);
#else
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
#endif
}
void Start(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
hipEventRecord(start_, stream);
#else
cudaEventRecord(start_, stream);
#endif
}
void Stop(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
hipEventRecord(stop_, stream);
#else
cudaEventRecord(stop_, stream);
#endif
}
float ElapsedTime() {
float milliseconds = 0;
#ifdef PADDLE_WITH_HIP
hipEventSynchronize(stop_);
hipEventElapsedTime(&milliseconds, start_, stop_);
#else
cudaEventSynchronize(stop_);
cudaEventElapsedTime(&milliseconds, start_, stop_);
#endif
return milliseconds;
}
private:
gpuEvent_t start_;
gpuEvent_t stop_;
};
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <functional>
#include "glog/logging.h"
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
template <typename T, int VecSize>
__global__ void VecSum(T *x, T *y, int N) {
#ifdef __HIPCC__
int idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
#else
int idx = blockDim.x * blockIdx.x + threadIdx.x;
#endif
using LoadT = phi::AlignedVector<T, VecSize>;
for (int i = idx * VecSize; i < N; i += blockDim.x * gridDim.x * VecSize) {
LoadT x_vec;
LoadT y_vec;
phi::Load<T, VecSize>(&x[i], &x_vec);
phi::Load<T, VecSize>(&y[i], &y_vec);
#pragma unroll
for (int j = 0; j < VecSize; j++) {
y_vec[j] = x_vec[j] + y_vec[j];
}
phi::Store<T, VecSize>(y_vec, &y[i]);
}
}
template <int Vecsize, int Threads, size_t Blocks>
void Algo(float *d_in, float *d_out, size_t N) {
#ifdef __HIPCC__
hipLaunchKernelGGL(HIP_KERNEL_NAME(VecSum<float, Vecsize>),
dim3(Blocks),
dim3(Threads),
0,
0,
d_in,
d_out,
N);
#else
VecSum<float, Vecsize><<<Blocks, Threads>>>(d_in, d_out, N);
#endif
}
TEST(GpuTimer, Sum) {
float *in1, *in2, *out;
float *d_in1, *d_in2;
size_t N = 1 << 20;
size_t size = sizeof(float) * N;
#ifdef __HIPCC__
hipMalloc(reinterpret_cast<void **>(&d_in1), size);
hipMalloc(reinterpret_cast<void **>(&d_in2), size);
#else
cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
cudaMalloc(reinterpret_cast<void **>(&d_in2), size);
#endif
in1 = reinterpret_cast<float *>(malloc(size));
in2 = reinterpret_cast<float *>(malloc(size));
out = reinterpret_cast<float *>(malloc(size));
for (size_t i = 0; i < N; i++) {
in1[i] = 1.0f;
in2[i] = 2.0f;
}
#ifdef __HIPCC__
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
#else
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
#endif
using Functor = std::function<void(float *, float *, size_t)>;
Functor algo0 = Algo<4, 256, 1024>;
Functor algo1 = Algo<1, 256, 1024>;
Functor algo2 = Algo<1, 256, 8>;
std::vector<Functor> algos = {algo0, algo1, algo2};
for (size_t j = 0; j < algos.size(); ++j) {
auto algo = algos[j];
phi::GpuTimer timer;
timer.Start(0);
algo(d_in1, d_in2, N);
timer.Stop(0);
VLOG(3) << "algo: " << j << " cost: " << timer.ElapsedTime() << " ms";
}
#ifdef __HIPCC__
hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
#else
cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
#endif
free(in1);
free(in2);
free(out);
#ifdef __HIPCC__
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
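The test above already has the shape of the autotuning loop this directory is being prepared for: interchangeable candidates behind a single std::function signature, each timed with GpuTimer. The test only logs the timings; a selection step, if one were wanted, could look like the following sketch (it reuses the test's algos, d_in1, d_in2 and N, and additionally needs <limits>):

int best = -1;
float best_ms = std::numeric_limits<float>::max();
for (size_t j = 0; j < algos.size(); ++j) {
  phi::GpuTimer timer;
  timer.Start(0);
  algos[j](d_in1, d_in2, N);  // run one candidate on the device buffers
  timer.Stop(0);
  float ms = timer.ElapsedTime();
  if (ms < best_ms) { best_ms = ms; best = static_cast<int>(j); }
}
// `best` now indexes the fastest candidate for this problem size.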
......@@ -121,6 +121,10 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, Log10GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, Log1pGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
......@@ -233,3 +237,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log2_grad, Log2GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log10_grad, Log10GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log1p_grad, Log1pGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(log_double_grad, LogDoubleGradKernel)
......@@ -74,6 +74,10 @@ DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Log2, Log2Functor)
DEFINE_CPU_ACTIVATION_KERNEL(Log10, Log10Functor)
DEFINE_CPU_ACTIVATION_KERNEL(Log1p, Log1pFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
......@@ -118,3 +122,7 @@ PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/squeeze_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
PD_REGISTER_KERNEL(squeeze_grad,
CPU,
ALL_LAYOUT,
phi::SqueezeGradKernel,
float,
double,
phi::dtype::bfloat16,
bool,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/squeeze_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
PD_REGISTER_KERNEL(squeeze,
CPU,
ALL_LAYOUT,
phi::SqueezeKernel,
float,
double,
phi::dtype::bfloat16,
bool,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
PD_REGISTER_KERNEL(unsqueeze_grad,
CPU,
ALL_LAYOUT,
phi::UnsqueezeGradKernel,
phi::dtype::bfloat16,
bool,
int,
int16_t,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unsqueeze_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
PD_REGISTER_KERNEL(unsqueeze,
CPU,
ALL_LAYOUT,
phi::UnsqueezeKernel,
float,
double,
phi::dtype::bfloat16,
bool,
int,
int16_t,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void FullBatchSizeLikeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& shape,
const Scalar& val,
DataType dtype,
int x_batch_size_dim,
int out_batch_size_dim,
DenseTensor* out) {
if (x.lod().size() && x_batch_size_dim == 0) {
// set the correct batch size for the LoDTensor.
auto odims = out->dims();
odims[out_batch_size_dim] = static_cast<int>(x.lod().back().size()) - 1;
FullKernel<T, Context>(dev_ctx, phi::vectorize(odims), val, dtype, out);
return;  // already filled with the LoD-derived shape; skip the refill below
}
FullLikeKernel<T, Context>(dev_ctx, x, val, dtype, out);
}
} // namespace phi
PD_REGISTER_KERNEL(full_batch_size_like,
CPU,
ALL_LAYOUT,
phi::FullBatchSizeLikeKernel,
float,
double,
int,
int64_t,
bool) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(full_batch_size_like,
GPU,
ALL_LAYOUT,
phi::FullBatchSizeLikeKernel,
float,
double,
int,
int64_t,
bool,
phi::dtype::float16) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif
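Worked example for the LoD branch above, under hypothetical inputs: if x.lod().back() == {0, 3, 7} (two sequences) and out_batch_size_dim == 0, then

odims[0] = static_cast<int>(x.lod().back().size()) - 1;  // 3 - 1 = 2

so out is filled with batch size 2, overriding whatever batch size the static shape attribute carried.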
......@@ -14,6 +14,8 @@
#pragma once
#include <vector>
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -37,6 +39,18 @@ void FullLikeKernel(const Context& dev_ctx,
DataType dtype,
DenseTensor* out);
// In order to be compatible with fill_constant_batch_size_like op
// that are still used in the 2.x APIs
template <typename T, typename Context>
void FullBatchSizeLikeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& shape,
const Scalar& val,
DataType dtype,
int x_batch_size_dim,
int out_batch_size_dim,
DenseTensor* out);
template <typename T, typename Context>
void Full(const Context& dev_ctx,
const ScalarArray& shape,
......
......@@ -1223,6 +1223,133 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
}
};
// log(x) = natural logarithm of x
template <typename T>
struct LogFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log();
}
};
template <typename T>
struct LogGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) / x);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log2(x) = logarithm to the base 2 of the elements of x
template <typename T>
struct Log2Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log() / static_cast<T>(log(2));
}
};
// the gradient of log2(x) is 1/(x*ln(2))
template <typename T>
struct Log2GradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log10(x) = logarithm to the base 10 of the elements of x
template <typename T>
struct Log10Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log() / static_cast<T>(log(10));
}
};
// the gradient of log10(x) is 1/(x*ln(10))
template <typename T>
struct Log10GradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log1p(x) = natural logarithm of x+1
template <typename T>
struct Log1pFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = (static_cast<T>(1) + x).log();
}
};
template <typename T>
struct Log1pGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
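For reference, the derivatives that these four grad functors implement are

\[
\frac{d}{dx}\log x = \frac{1}{x}, \qquad
\frac{d}{dx}\log_2 x = \frac{1}{x\ln 2}, \qquad
\frac{d}{dx}\log_{10} x = \frac{1}{x\ln 10}, \qquad
\frac{d}{dx}\log(1+x) = \frac{1}{1+x},
\]

each multiplied into dout by the chain rule.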
template <typename T>
struct LogGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* X,
const DenseTensor* ddX,
DenseTensor* ddOut,
const DenseTensor* dOut,
DenseTensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
auto x = EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
// ddout = ddx / x; dx = -(dout / x) * (ddx / x)
// calculate dx first, so ddout can inplace ddx
if (dX) {
auto dout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
auto dx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
}
if (ddOut) {
auto ddout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
ddout.device(*d) = ddx * static_cast<T>(1) / x;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
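The comment inside LogGradGradFunctor follows from differentiating the first-order rule dx = dout / x once more:

\[
\text{ddout} = \frac{\partial}{\partial\,\text{dout}}\!\left(\frac{\text{dout}}{x}\right)\text{ddx} = \frac{\text{ddx}}{x},
\qquad
\text{dx} = \frac{\partial}{\partial x}\!\left(\frac{\text{dout}}{x}\right)\text{ddx} = -\frac{\text{dout}\cdot\text{ddx}}{x^{2}},
\]

which is also why dx is computed before ddout: ddOut may reuse ddX's buffer, and the dx formula still needs the original ddx.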
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
template <typename T>
struct CudaReluFunctor : public BaseActivationFunctor<T> {
......@@ -1970,6 +2097,99 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaLogFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// log(x) = natural logarithm of x
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log(x));
}
};
template <typename T>
struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
// dx = dout / x
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / x;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// log1p(x) = log(1 + x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log(one + x));
}
};
template <typename T>
struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout / (1 + x)
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (one + x);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog2Functor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// log2(x) = base-2 logarithm of x
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log2(x));
}
};
template <typename T>
struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
// dx = dout / (x * log(2))
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (x * log_two);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog10Functor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// log10(x) = base-10 logarithm of x
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log10(x));
}
};
template <typename T>
struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
// dx = dout / (x * log(10))
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (x * log_ten);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
#endif
} // namespace funcs
......
......@@ -21,6 +21,118 @@
namespace phi {
namespace funcs {
inline DDim GetOutputSqueezeShape(const std::vector<int> squeeze_dims,
const DDim& in_dims,
bool is_runtime) {
size_t num_squeeze_dims = squeeze_dims.size();
std::vector<bool> should_squeeze(in_dims.size(), false);
// Mark dimensions that need to be squeezed.
if (num_squeeze_dims == 0) {
for (int i = 0; i < in_dims.size(); ++i) {
if (in_dims[i] == 1) {
should_squeeze[i] = true;
}
}
} else {
for (size_t i = 0; i < num_squeeze_dims; ++i) {
int current = squeeze_dims[i] < 0 ? squeeze_dims[i] + in_dims.size()
: squeeze_dims[i];
PADDLE_ENFORCE_GE(
current,
0,
phi::errors::InvalidArgument(
"Each axis in Attr(axes) should be in the range of [%d, %d]"
"But current axis is:%d, input tensor's shape = [%s].",
-in_dims.size(),
in_dims.size() - 1,
current,
in_dims));
PADDLE_ENFORCE_LT(
current,
in_dims.size(),
phi::errors::InvalidArgument(
"Each axis in Attr(axes) should be in the range of [%d, %d]"
"But current axis is:%d, input tensor's shape = [%s].",
-in_dims.size(),
in_dims.size() - 1,
current,
in_dims));
if (!should_squeeze[current]) {
if (is_runtime) {
// At run time, dim of 1 is allowed to squeeze
if (in_dims[current] == 1) {
should_squeeze[current] = true;
}
} else {
// At compile time, dim of -1 or 1 is allowed to squeeze
if (in_dims[current] == 1 || in_dims[current] == -1) {
should_squeeze[current] = true;
}
}
}
}
}
// Make output dimensions
std::vector<int64_t> output_shape;
for (int i = 0; i < in_dims.size(); ++i) {
if (!should_squeeze[i]) {
output_shape.push_back(in_dims[i]);
}
}
return phi::make_ddim(output_shape);
}
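A quick standalone check of the rules above (a plain C++ re-implementation of the empty-axes case, for illustration only):

#include <cstdint>
#include <iostream>
#include <vector>

// With no axes given, GetOutputSqueezeShape drops every size-1 dim.
std::vector<int64_t> SqueezeAll(const std::vector<int64_t>& in) {
  std::vector<int64_t> out;
  for (int64_t d : in) {
    if (d != 1) out.push_back(d);
  }
  return out;
}

int main() {
  // {1, 3, 1, 5} -> {3, 5}
  for (int64_t d : SqueezeAll({1, 3, 1, 5})) std::cout << d << ' ';
  std::cout << '\n';  // prints: 3 5
}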
inline DDim GetUnsqueezeShape(const std::vector<int> unsqz_dims,
const DDim& in_dims) {
int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
int cur_output_size = in_dims.size();
std::vector<int64_t> output_shape(output_size, 0);
// Validity Check: rank range.
PADDLE_ENFORCE_LE(
output_size,
6,
phi::errors::InvalidArgument("The output "
"tensor's rank should be less than 6."));
for (int axis : unsqz_dims) {
int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
// Validity Check: the axis bound
PADDLE_ENFORCE_GE(
cur,
0,
phi::errors::InvalidArgument("The insert dimension value should "
"not be less than 0"));
PADDLE_ENFORCE_LE(cur,
cur_output_size,
phi::errors::InvalidArgument(
"The insert dimension value shoule not be larger "
"than the dimension size of input tensor"));
// Move old axis, and insert new axis
for (int i = cur_output_size; i >= cur; --i) {
if (output_shape[i] == 1) {
// Move axis
output_shape[i + 1] = 1;
output_shape[i] = 0;
}
}
output_shape[cur] = 1;
// Add the output size.
cur_output_size++;
}
// Make output shape
for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
if (output_shape[out_idx] == 0) {
output_shape[out_idx] = in_dims[in_idx++];
}
}
return phi::make_ddim(output_shape);
}
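The insertion logic reads more clearly on a concrete case. A simplified standalone re-implementation for non-negative, pre-validated axes (illustration only; the real code also handles negative axes and the rank checks):

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> UnsqueezeShape(std::vector<int64_t> dims,
                                    const std::vector<int>& axes) {
  // Each axis is interpreted against the already-grown output,
  // matching the cur_output_size bookkeeping above.
  for (int axis : axes) {
    dims.insert(dims.begin() + axis, 1);
  }
  return dims;
}

int main() {
  // {3, 5} with axes {1, 3}: first {3, 1, 5}, then {3, 1, 5, 1}.
  for (int64_t d : UnsqueezeShape({3, 5}, {1, 3})) std::cout << d << ' ';
  std::cout << '\n';  // prints: 3 1 5 1
}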
inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) {
// don't copy data, only change the dims
......
......@@ -177,6 +177,10 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
CudaLeakyReluGradFunctor,
......@@ -300,3 +304,14 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log2_grad, Log2GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log10_grad, Log10GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log1p_grad, Log1pGradKernel)
PD_REGISTER_KERNEL(log_double_grad,
GPU,
ALL_LAYOUT,
phi::LogDoubleGradKernel,
float,
double,
phi::dtype::float16) {}
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/impl/activation_grad_impl.h"
#include "paddle/phi/kernels/impl/activation_impl.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
......@@ -93,6 +93,10 @@ DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Log2, CudaLog2Functor)
DEFINE_GPU_ACTIVATION_KERNEL(Log10, CudaLog10Functor)
DEFINE_GPU_ACTIVATION_KERNEL(Log1p, CudaLog1pFunctor)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
......@@ -164,3 +168,7 @@ PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/squeeze_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
PD_REGISTER_KERNEL(squeeze_grad,
GPU,
ALL_LAYOUT,
phi::SqueezeGradKernel,
float,
double,
phi::dtype::bfloat16,
phi::dtype::float16,
bool,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/squeeze_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
PD_REGISTER_KERNEL(squeeze,
GPU,
ALL_LAYOUT,
phi::SqueezeKernel,
float,
double,
phi::dtype::bfloat16,
phi::dtype::float16,
bool,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
PD_REGISTER_KERNEL(unsqueeze_grad,
GPU,
ALL_LAYOUT,
phi::UnsqueezeGradKernel,
phi::dtype::bfloat16,
phi::dtype::float16,
bool,
int,
int16_t,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unsqueeze_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
PD_REGISTER_KERNEL(unsqueeze,
GPU,
ALL_LAYOUT,
phi::UnsqueezeKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16,
bool,
int,
int16_t,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -275,4 +275,22 @@ void SigmoidTripleGradKernel(const Context& dev_ctx,
d_ddx);
}
template <typename T, typename Context>
void LogDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx,
DenseTensor* dx,
DenseTensor* ddout) {
if (dx) {
dx->Resize(x.dims());
dev_ctx.template Alloc<T>(dx);
}
if (ddout) {
dev_ctx.template Alloc<T>(ddout);
}
funcs::LogGradGradFunctor<T> functor;
functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace phi {
template <typename T, typename Context>
void SqueezeGradKernel(const Context& dev_ctx,
const DenseTensor& xshape,
const DenseTensor& dout,
const std::vector<int>& axes,
DenseTensor* dx) {
auto xshape_dims = xshape.dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
dev_ctx.template Alloc<T>(dx);
phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
dx->Resize(x_dims);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
template <typename T, typename Context>
void SqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& axes,
DenseTensor* xshape,
DenseTensor* out) {
auto x_dims = x.dims();
auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, true);
dev_ctx.template Alloc<T>(out);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace phi {
template <typename T, typename Context>
void UnsqueezeGradKernel(const Context& dev_ctx,
const DenseTensor& x_shape,
const DenseTensor& dout,
DenseTensor* dx) {
auto xshape_dims = x_shape.dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
dev_ctx.template Alloc<T>(dx);
phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), true, dx);
dx->Resize(x_dims);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
template <typename T, typename Context>
void UnsqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& axes,
DenseTensor* xshape,
DenseTensor* out) {
auto x_dims = x.dims();
auto out_dims = out->dims();
if (axes.FromTensor()) {
std::vector<int32_t> tmp;
tmp.reserve(axes.GetData().size());
std::for_each(axes.GetData().begin(),
axes.GetData().end(),
[&tmp](const int64_t& t) { tmp.push_back(t); });
out_dims = funcs::GetUnsqueezeShape(tmp, x_dims);
}
out->Resize(out_dims);
dev_ctx.template Alloc<T>(out);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims); // copy will reset the dims.
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void SqueezeGradKernel(const Context& dev_ctx,
const DenseTensor& xshape,
const DenseTensor& dout,
const std::vector<int>& axes,
DenseTensor* dx);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void SqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& axes,
DenseTensor* xshape,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UnsqueezeGradKernel(const Context& dev_ctx,
const DenseTensor& x_shape,
const DenseTensor& dout,
DenseTensor* dx);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UnsqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& axes,
DenseTensor* xshape,
DenseTensor* out);
} // namespace phi
......@@ -57,6 +57,10 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log, "log", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log2, "log2", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log10, "log10", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log1p, "log1p", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT
......@@ -125,6 +129,12 @@ KernelSignature EluDoubleGradOpArgumentMapping(
"elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"});
}
KernelSignature LogDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"log_double_grad", {"X", "DOut", "DDX"}, {}, {"DX", "DDOut"});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
......@@ -134,6 +144,7 @@ PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(log_grad_grad, log_double_grad);
PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
......@@ -181,3 +192,8 @@ PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
phi::LogSigmoidGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
phi::HardSigmoidGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,21 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
int>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
int64_t>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
bool>);
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature FillConstantBatchSizeLikeOpArgumentMapping(
const ArgumentMappingContext& ctx) {
const auto& str_value = paddle::any_cast<std::string>(ctx.Attr("str_value"));
if (str_value.empty()) {
return KernelSignature(
"full_batch_size_like",
{"Input"},
{"shape", "value", "dtype", "input_dim_idx", "output_dim_idx"},
{"Out"});
} else {
return KernelSignature(
"full_batch_size_like",
{"Input"},
{"shape", "str_value", "dtype", "input_dim_idx", "output_dim_idx"},
{"Out"});
}
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(fill_constant_batch_size_like,
full_batch_size_like);
PD_REGISTER_ARG_MAPPING_FN(fill_constant_batch_size_like,
phi::FillConstantBatchSizeLikeOpArgumentMapping);
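Reading the mapping above: the legacy fill_constant_batch_size_like op carries both a numeric value attribute and a string str_value attribute, and the string form takes precedence whenever it is non-empty. Illustrative (hand-written, not generated) renderings of the two resulting signatures:

// str_value == ""  -> full_batch_size_like(Input | shape, value, dtype, input_dim_idx, output_dim_idx | Out)
// str_value != ""  -> full_batch_size_like(Input | shape, str_value, dtype, input_dim_idx, output_dim_idx | Out)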
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("squeeze", {"X"}, {"axes"}, {"XShape", "Out"});
}
KernelSignature SqueezeGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("squeeze_grad",
{"XShape", GradVarName("Out")},
{"axes"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze);
PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad);
PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::SqueezeOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::SqueezeGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.InputSize("AxesTensorList") > 0) {
VLOG(2) << "unsqueeze2 in AxesTensorList";
return KernelSignature(
"unsqueeze", {"X"}, {"AxesTensorList"}, {"XShape", "Out"});
} else if (ctx.InputSize("AxesTensor") > 0) {
VLOG(2) << "unsqueeze2 in AxesTensor";
return KernelSignature(
"unsqueeze", {"X"}, {"AxesTensor"}, {"XShape", "Out"});
} else {
VLOG(2) << "unsqueeze2 in axes";
return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"XShape", "Out"});
}
}
KernelSignature UnsqueezeGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"unsqueeze_grad", {"XShape", GradVarName("Out")}, {}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze);
PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad);
PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::UnsqueezeOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad,
phi::UnsqueezeGradOpArgumentMapping);
......@@ -774,12 +774,12 @@ set +x
get_precision_ut_mac
ut_actual_total_startTime_s=`date +%s`
if [[ "$on_precision" == "0" ]];then
ctest -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
ctest -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
else
ctest -R "$UT_list_prec" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
ctest -R "$UT_list_prec" -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
tmpfile_rand=`date +%s%N`
tmpfile=$tmp_dir/$tmpfile_rand
ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
fi
ut_total_endTime_s=`date +%s`
echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s"
......@@ -848,7 +848,7 @@ set +x
fi
done
failed_test_lists=''
ctest -R "$retry_unittests_regular" --output-on-failure -j 2 | tee $tmpfile
ctest -R "$retry_unittests_regular" --timeout 120 --output-on-failure -j 2 | tee $tmpfile
collect_failed_tests
rm -f $tmp_dir/*
exec_times=$[$exec_times+1]
......
......@@ -59,6 +59,13 @@ class TestScope(unittest.TestCase):
# It is not allowed to delete a nonexistent scope.
scope._remove_from_pool()
def test_size(self):
paddle_c = paddle.fluid.core
scope = paddle_c.Scope()
var_a = scope.var("var_a")
self.assertEqual(scope.size(), 1)
self.assertIsNotNone(scope.find_var('var_a'))
if __name__ == '__main__':
unittest.main()
......@@ -148,7 +148,6 @@ def source_include(header_file_path):
#include "paddle/phi/infermeta/nullary.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/infermeta/ternary.h"
#include "paddle/phi/kernels/declarations.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
"""
......
......@@ -231,6 +231,12 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
check_approval 1 6888866 39303645
fi
HAS_MODIFIED_DECLARATIONS=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels/declarations.h" || true`
if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="You must be approved by chenwhql for any use of paddle/phi/kernels/declarations.h. Thanks!\n"
check_approval 1 22561442
fi
ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
......